From 7a58f64e533e0c8a72c198e460c6741aaad100d8 Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Mon, 18 Mar 2019 13:57:28 -0400 Subject: [PATCH 01/13] Support multi-device jobs --- scripts/Note | 10 ++++ .../tensorflow/device/gpu/lane/lanemgr.cpp | 45 ++++++++++---- src/oplibraries/tensorflow/tfinstance.cpp | 60 +++++++++++++------ tests/test_tf/test_mnist_tf.py | 4 ++ tests/test_tf/test_seq.py | 2 + tests/test_tf/test_super_res.py | 2 + tests/test_tf/test_vae.py | 2 + tests/test_tf/test_vgg.py | 2 + 8 files changed, 96 insertions(+), 31 deletions(-) diff --git a/scripts/Note b/scripts/Note index 39b1651..3b949d4 100644 --- a/scripts/Note +++ b/scripts/Note @@ -9,3 +9,13 @@ Without aggregate mode, metrics shows up in CUPTI_ACTIVITY_KIND_METRIC table, ot The value field of these table is blob, which can be parsed as double tail -f server.output | egrep -e 'OpItem ExecTask' | egrep --color=always -e 'failures=[[:digit:]]+' -e '[[:digit:]]+ ms' + +bazel test $(bazel query --keep_going 'let base = //tensorflow/python/kernel_tests/...:all in kind(test, $base) intersect attr(tags, "cuda-py-test", $base)') + +nvprof --aggregate-mode off --metrics sm_efficiency --csv + +nvprof --aggregate-mode off --metrics sm_efficiency python test.py +nvprof --print-gpu-trace python test.py + +All tests: +bazel query --keep_going 'let base = //tensorflow/python/kernel_tests/...:all in kind(test, $base) intersect attr(tags, "cuda-py-test", $base)' | sed 's#//#bazel-bin/#;s#:#/#' diff --git a/src/oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp b/src/oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp index 3bfadc7..75e4c7a 100644 --- a/src/oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp +++ b/src/oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp @@ -26,6 +26,9 @@ #include "utils/envutils.h" #include "utils/threadutils.h" +#include +#include + namespace tfgpu = perftools::gputools; namespace salus::oplib::tensorflow { @@ -123,30 +126,48 @@ void LaneMgr::processRequests() 
processRequests(sstl::with_guard(m_mu)); } -void LaneMgr::processRequests(sstl::detail::Guard &&g) +void LaneMgr::processRequests(sstl::detail::Guard &&) { - UNUSED(g); - auto it = m_pending.begin(); auto end = m_pending.end(); while (it != end) { auto &req = *it; + const auto reqLen = req.layout.memoryLimits.size(); - // TODO: the algorithm below assumes single GPU, to scale to multiple ones, a global lock is needed - CHECK_EQ(req.layout.memoryLimits.size(), 1_sz) << "Only single lane layout is supported"; + CHECK_LE(reqLen, m_gpus.size()) << "Requested more GPU than available"; + + // use a greedy algorithm, sort requested layout in desc order, and try to fit the largest one first + std::vector indices(reqLen); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&req](const int a, const int b) { + if (req.layout.memoryLimits.at(a) == req.layout.memoryLimits.at(b)) { + return req.layout.persistentOccupation.at(a) < req.layout.persistentOccupation.at(b); + } + return req.layout.memoryLimits.at(a) < req.layout.memoryLimits.at(b); + }); std::vector> lanes; - auto &gcb = m_gpus.at(0); + for (auto idx : indices) { + std::shared_ptr lane{nullptr}; + for (auto &gcb : m_gpus) { + lane = gcb.bestFitFor(req.layout.memoryLimits.at(idx), req.layout.persistentOccupation.at(idx)); + if (lane) { + break; + } + } + if (!lane) { + // can't find a suitable allocation + break; + } + lanes.emplace_back(std::move(lane)); + } - // using a best fit policy - const auto firstIdx = 0; - auto lane = gcb.bestFitFor(req.layout.memoryLimits.at(firstIdx), req.layout.persistentOccupation.at(firstIdx)); - if (!lane) { - // can't find a suitable allocation. 
+ if (lanes.size() != reqLen) { + // no enough lanes + lanes.clear(); ++it; continue; } - lanes.emplace_back(std::move(lane)); req.cb(std::move(lanes)); diff --git a/src/oplibraries/tensorflow/tfinstance.cpp b/src/oplibraries/tensorflow/tfinstance.cpp index 5638f7b..eef26de 100644 --- a/src/oplibraries/tensorflow/tfinstance.cpp +++ b/src/oplibraries/tensorflow/tfinstance.cpp @@ -91,28 +91,50 @@ void TFInstance::handleCreateSession(std::unique_ptr & LaneMgr::Layout layout; // Get resource estimation from client - constexpr const auto rt = "MEMORY:GPU"; - const auto totalGPUMemory = m_laneMgr->totalMemoryForGPU(0); - size_t limit = 0; - size_t persistant = 0; + constexpr const char *rt[] = { + "MEMORY:GPU0", + "MEMORY:GPU1", + "MEMORY:GPU2", + "MEMORY:GPU3", + "MEMORY:GPU4", + nullptr, + }; auto &m = req->config().salus_options().resource_map(); - persistant = static_cast(std::round(sstl::getOrDefault(m.persistant(), rt, 0.0))); - // HACK: scale up 10% to mitigate OOM and fragmentation - persistant = static_cast(persistant * 1.1); - limit += persistant; - limit += static_cast(std::round(sstl::getOrDefault(m.temporary(), rt, 0.0))); - - // HACK: Double the persistant and add to to temporary, just to be safe - limit = static_cast(limit * 1.05); // and even more 10% - limit = std::min(limit, totalGPUMemory); // cap to max value - - if (limit == 0) { - limit = totalGPUMemory; - persistant = limit; + for (auto iGpu = 0_sz; iGpu != m_laneMgr->numGPUs(); ++iGpu) { + const auto totalGPUMemory = m_laneMgr->totalMemoryForGPU(iGpu); + + CHECK_NOTNULL(rt[iGpu]) << "We need more GPU strings"; + + size_t limit = 0; + size_t persistant = 0; + auto p = sstl::optionalGet(m.persistant(), rt[iGpu]); + auto t = sstl::optionalGet(m.temporary(), rt[iGpu]); + if (!p || !t) { + break; + } + persistant = static_cast(std::round(*p)); + // HACK: scale persistent up 10% to mitigate OOM and fragmentation + persistant = static_cast(persistant * 1.1); + limit += persistant; + + limit += 
static_cast(std::round(*t)); + + // HACK: scale the total up 5%, just to be safe + limit = static_cast(limit * 1.05); // and even more 10% + limit = std::min(limit, totalGPUMemory); // cap to max value + + layout.memoryLimits.push_back(limit); + layout.persistentOccupation.push_back(persistant); + } + + if (layout.memoryLimits.empty()) { + auto limit = m_laneMgr->totalMemoryForGPU(0); + layout.memoryLimits.push_back(limit); + layout.persistentOccupation.push_back(limit); LOG(WARNING) << "No resource info for current session, assuming whole GPU allocation: " << limit; } - layout.memoryLimits.push_back(limit); - layout.persistentOccupation.push_back(persistant); + + CHECK_EQ(layout.memoryLimits.size(), layout.persistentOccupation.size()); auto totalRunningTime = static_cast(std::round(sstl::getOrDefault(m.persistant(), "TIME:TOTAL", 0.0))) * 1000; diff --git a/tests/test_tf/test_mnist_tf.py b/tests/test_tf/test_mnist_tf.py index 43239de..b652595 100644 --- a/tests/test_tf/test_mnist_tf.py +++ b/tests/test_tf/test_mnist_tf.py @@ -293,6 +293,8 @@ def _config(self, **kwargs): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[batch_size][1] return config @@ -313,6 +315,8 @@ def _config(self, **kwargs): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[batch_size][1] return config diff --git a/tests/test_tf/test_seq.py b/tests/test_tf/test_seq.py 
index 5d40663..64edbbb 100644 --- a/tests/test_tf/test_seq.py +++ b/tests/test_tf/test_seq.py @@ -192,6 +192,8 @@ def _config(self, model_size, isEval=False): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[model_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[model_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[model_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[model_size][1] return config diff --git a/tests/test_tf/test_super_res.py b/tests/test_tf/test_super_res.py index fd15c1b..075c276 100644 --- a/tests/test_tf/test_super_res.py +++ b/tests/test_tf/test_super_res.py @@ -95,6 +95,8 @@ def _config(self, isEval=False, **kwargs): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[batch_size][1] return config def _get_func(self, batch_size, isEval=False): diff --git a/tests/test_tf/test_vae.py b/tests/test_tf/test_vae.py index 466c5b0..59085e6 100644 --- a/tests/test_tf/test_vae.py +++ b/tests/test_tf/test_vae.py @@ -124,6 +124,8 @@ def _config(self, args, isEval=False): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[args.batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[args.batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[args.batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[args.batch_size][1] return config @parameterized.expand([(1,), (5,), (10,)]) diff --git a/tests/test_tf/test_vgg.py b/tests/test_tf/test_vgg.py index 
0502121..7cc7206 100644 --- a/tests/test_tf/test_vgg.py +++ b/tests/test_tf/test_vgg.py @@ -166,6 +166,8 @@ def _config(self, **kwargs): config.allow_soft_placement = True config.salus_options.resource_map.temporary['MEMORY:GPU'] = memusages[batch_size][0] config.salus_options.resource_map.persistant['MEMORY:GPU'] = memusages[batch_size][1] + config.salus_options.resource_map.temporary['MEMORY:GPU0'] = memusages[batch_size][0] + config.salus_options.resource_map.persistant['MEMORY:GPU0'] = memusages[batch_size][1] return config From 0b12cc0b4cd4b3c9ade256178d62fdad7225f23e Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Sun, 24 Mar 2019 15:13:32 -0400 Subject: [PATCH 02/13] Fix test_mnist_correctness --- tests/test_tf/test_mnist_tf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_tf/test_mnist_tf.py b/tests/test_tf/test_mnist_tf.py index b652595..bcdd15b 100644 --- a/tests/test_tf/test_mnist_tf.py +++ b/tests/test_tf/test_mnist_tf.py @@ -266,8 +266,9 @@ def test_distributed(self, batch_size): dev='/job:tfworker/device:GPU:0', config=self._config(batch_size=batch_size)) - def test_correctness(self): - actual, expected = run_on_rpc_and_gpu(self._runner(), config=self._config()) + @parameterized.expand([(25,), (50,), (100,)]) + def test_correctness(self, batch_size): + actual, expected = run_on_rpc_and_gpu(self._runner(batch_size), config=self._config()) assertAllClose(actual, expected, rtol=1e-3) From 9a32989b40e5122e89e68851b8049c31c6f1dc8e Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Tue, 2 Apr 2019 20:56:04 -0400 Subject: [PATCH 03/13] Support blocking iteration kernel launch according to SM usage --- .clang-format | 4 - CMakeLists.txt | 6 +- cmake/Defaults.cmake | 2 +- src/CMakeLists.txt | 14 ++ src/cudahook/CMakeLists.txt | 32 +++ src/cudahook/cudahook.cpp | 166 ++++++++++++++ src/cudahook/cudahook.h | 85 +++++++ src/cudahook/functions.def | 43 ++++ src/cudahook/kernellaunches.cpp | 209 ++++++++++++++++++ 
src/cudahook/kernellaunches.h | 139 ++++++++++++ src/cudahook/realdlsym.cpp | 45 ++++ src/cudahook/realdlsym.h | 43 ++++ src/execution/engine/taskexecutor.cpp | 3 + src/execution/executionengine.cpp | 2 + .../threadpool/nonblockingthreadpool.cpp | 3 + src/oplibraries/tensorflow/tfinstance.cpp | 1 + src/oplibraries/tensorflow/v3/smblocker.cpp | 157 +++++++++++++ src/oplibraries/tensorflow/v3/smblocker.h | 75 +++++++ src/oplibraries/tensorflow/v3/tf_executor.cpp | 13 ++ src/platform/CMakeLists.txt | 12 +- src/platform/posix/thread_annotations.cpp | 40 ++++ src/platform/thread_annotations.h | 7 + src/rpcserver/iothreadpool.cpp | 2 + src/salus-server.list | 3 + src/utils/threadutils.cpp | 10 +- src/utils/threadutils.h | 11 +- 26 files changed, 1115 insertions(+), 12 deletions(-) create mode 100644 src/cudahook/CMakeLists.txt create mode 100644 src/cudahook/cudahook.cpp create mode 100644 src/cudahook/cudahook.h create mode 100644 src/cudahook/functions.def create mode 100644 src/cudahook/kernellaunches.cpp create mode 100644 src/cudahook/kernellaunches.h create mode 100644 src/cudahook/realdlsym.cpp create mode 100644 src/cudahook/realdlsym.h create mode 100644 src/oplibraries/tensorflow/v3/smblocker.cpp create mode 100644 src/oplibraries/tensorflow/v3/smblocker.h create mode 100644 src/platform/posix/thread_annotations.cpp create mode 100644 src/salus-server.list diff --git a/.clang-format b/.clang-format index 8bef8b5..ba2855d 100644 --- a/.clang-format +++ b/.clang-format @@ -96,10 +96,6 @@ PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000 PenaltyReturnTypeOnItsOwnLine: 10000 PointerAlignment: Right -RawStringFormats: - - Delimiter: pb - Language: TextProto - BasedOnStyle: google ReflowComments: true SortIncludes: true SortUsingDeclarations: true diff --git a/CMakeLists.txt b/CMakeLists.txt index e1bb1a0..9169cba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ # Version 3.8 required from CheckCXXFeature.cmake # Version 3.10 required from Boost 
1.66.0 for imported target -cmake_minimum_required(VERSION 3.10.0) +# Version 3.13 required for target_link_options +cmake_minimum_required(VERSION 3.13.0) project(executor VERSION 1.0.0 LANGUAGES C CXX) @@ -84,6 +85,9 @@ endif(WITH_TCMALLOC) find_package(nlohmann_json) set_package_properties(nlohmann_json PROPERTIES TYPE OPTIONAL PURPOSE "For OpTracing logging") +set(THREADS_PREFER_PTHREAD_FLAG) +find_package(Threads) + # Bundled third party library add_subdirectory(thirdparty) #--------------------------------------------------------------------------------------- diff --git a/cmake/Defaults.cmake b/cmake/Defaults.cmake index 2335e10..99b553c 100644 --- a/cmake/Defaults.cmake +++ b/cmake/Defaults.cmake @@ -12,7 +12,7 @@ endif() list(APPEND CMAKE_PREFIX_PATH spack-packages) list(REMOVE_DUPLICATES CMAKE_PREFIX_PATH) -# Use C++14 standard +# Use C++17 standard set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c4a350d..4fa7123 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -62,6 +62,7 @@ if(USE_TENSORFLOW) "oplibraries/tensorflow/v3/sigraphmgr.cpp" "oplibraries/tensorflow/v3/tf_executor.cpp" + "oplibraries/tensorflow/v3/smblocker.cpp" "oplibraries/tensorflow/device/shadowdevices.cpp" "oplibraries/tensorflow/device/salusdevices.cpp" @@ -96,6 +97,14 @@ if(USE_TENSORFLOW) ) endif(USE_TENSORFLOW) +target_link_options(salus-server + PRIVATE + "LINKER:--dynamic-list=${CMAKE_CURRENT_SOURCE_DIR}/salus-server.list" +) +set_target_properties(salus-server PROPERTIES + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/salus-server.list +) + #--------------------------------------------------------------------------------------- # Instrucment #--------------------------------------------------------------------------------------- @@ -107,6 +116,11 @@ elseif(WITH_TCMALLOC) target_link_libraries(salus-server gperftools::tcmalloc) endif() 
+#--------------------------------------------------------------------------------------- +# CUDA Hooker +#--------------------------------------------------------------------------------------- +add_subdirectory(cudahook) + #--------------------------------------------------------------------------------------- # Installation #--------------------------------------------------------------------------------------- diff --git a/src/cudahook/CMakeLists.txt b/src/cudahook/CMakeLists.txt new file mode 100644 index 0000000..a83369a --- /dev/null +++ b/src/cudahook/CMakeLists.txt @@ -0,0 +1,32 @@ +set(SRC_LIST + cudahook.cpp + realdlsym.cpp + kernellaunches.cpp) + +add_library(cudahook SHARED ${SRC_LIST}) + +set_target_properties(cudahook PROPERTIES + VISIBILITY_INLINES_HIDDEN 1 + CXX_VISIBILITY_PRESET hidden +) + +include(GenerateExportHeader) +generate_export_header(cudahook) +target_include_directories(cudahook + PUBLIC + ${CMAKE_CURRENT_BINARY_DIR} +) + +target_compile_definitions(cudahook + PRIVATE + _GNU_SOURCE=1 +) + +target_link_libraries(cudahook + PRIVATE + ${CMAKE_DL_LIBS} +) + +install(TARGETS cudahook + LIBRARY DESTINATION lib +) diff --git a/src/cudahook/cudahook.cpp b/src/cudahook/cudahook.cpp new file mode 100644 index 0000000..f0b2f68 --- /dev/null +++ b/src/cudahook/cudahook.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cudahook.h" +#include "cudahook_export.h" + +#include "realdlsym.h" + +#include + +#include +#include +#include + +using salus::real_dlsym; + +/* + * We need to give the pre-processor a chance to replace a function, such as: + * cuMemAlloc => cuMemAlloc_v2 + */ +#define STRINGIFY(x) #x +#define CUDA_SYMBOL_STRING(x) STRINGIFY(x) + +namespace salus { + +CudaHook::CudaHook(const char *dl) +{ + // Load the libcuda.so library with RTLD_GLOBAL so we can hook the function calls + m_handle = dlopen(dl, RTLD_LAZY | RTLD_GLOBAL); + if (!m_handle) { + std::cerr << "Error to open library " << dl << ": " << dlerror() << std::endl; + std::exit(-1); + } + +#define USE_FUNC(funcname, ret, params, ...) \ + m_orig.funcname = func_cast(real_dlsym(m_handle, CUDA_SYMBOL_STRING(funcname))); \ + if (!m_orig.funcname) { \ + std::cerr << "Error to find symbol " CUDA_SYMBOL_STRING(funcname) ": " << dlerror() << std::endl; \ + std::exit(-2); \ + } +#include "functions.def" + + auto envDebug = std::getenv("CUDA_HOOK_DEBUG"); + if (envDebug && envDebug[0] == '1') { + m_debugging = true; + std::cerr << "CUDA HOOK Library loaded." 
<< std::endl; + } +} + +CudaHook &CudaHook::instance() +{ + static CudaHook hook("libcuda.so"); + return hook; +} + +CudaHook::~CudaHook() { + if (m_handle) { + dlclose(m_handle); + } +} + +struct HookAccessor +{ + const salus::CudaHook &hook; + + bool debugging() const + { + return hook.m_debugging; + } + + const auto &orig() const + { + return hook.m_orig; + } + + const auto &pre() const + { + return hook.m_pre; + } + + const auto &post() const + { + return hook.m_post; + } +}; + +} // namespace salus + + +/* + * Interposed Functions + */ +extern "C" { + +/* + * Other interposed functions + */ +#define USE_FUNC(funcname, ret, params, ...) \ + CUDAHOOK_EXPORT int funcname params \ + { \ + const salus::HookAccessor hook{salus::CudaHook::instance()}; \ + if (hook.pre().funcname) { \ + hook.pre().funcname(__VA_ARGS__); \ + } \ + if (hook.debugging()) { \ + std::cerr << "Hooked function " CUDA_SYMBOL_STRING(funcname) " is called\n";\ + } \ + auto res = hook.orig().funcname(__VA_ARGS__); \ + if (hook.post().funcname) { \ + hook.post().funcname(__VA_ARGS__); \ + } \ + return res; \ + } +#include "functions.def" + +/* + * We need to interpose dlsym since anyone using dlopen+dlsym to get the CUDA driver symbols will bypass + * the hooking mechanism (this includes the CUDA runtime). Its tricky though, since if we replace the + * real dlsym with ours, we can't dlsym() the real dlsym. To get around that, call the 'private' + * libc interface called __libc_dlsym to get the real dlsym. + */ +CUDAHOOK_EXPORT void* dlsym(void *handle, const char *symbol) noexcept +{ + // Early out if not a CUDA driver symbol + if (strncmp(symbol, "cu", 2) != 0) { + return real_dlsym(handle, symbol); + } + + const salus::HookAccessor hook{salus::CudaHook::instance()}; + if (hook.debugging()) { + std::cerr << "Hooked dlsym: requesting " << symbol << "\n"; + } + +#define USE_FUNC(funcname, ret, params, ...) 
\ + if (strcmp(symbol, CUDA_SYMBOL_STRING(funcname)) == 0) { \ + return reinterpret_cast(&funcname);\ + } +#include "functions.def" + + return real_dlsym(handle, symbol); +} + +} // extern "C" diff --git a/src/cudahook/cudahook.h b/src/cudahook/cudahook.h new file mode 100644 index 0000000..c6cf2b7 --- /dev/null +++ b/src/cudahook/cudahook.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SALUS_CUDAHOOK_H +#define SALUS_CUDAHOOK_H + +#include + +namespace salus { + +struct HookAccessor; + +/* + * Typedefs of function types + */ +#define USE_FUNC(funcname, ret, params, ...) 
using Fn_##funcname = ret params; +#include "functions.def" + +/** + * @brief Callback structure for each of hooked function + */ +struct HookedFunctions +{ +#define USE_FUNC(funcname, ret, params, ...) \ + std::function funcname = nullptr; +#include "functions.def" +}; + +class CudaHook +{ + HookedFunctions m_orig; + + HookedFunctions m_pre; + HookedFunctions m_post; + + void *m_handle = nullptr; + + bool m_debugging = false; + + explicit CudaHook(const char *dl); + + friend struct HookAccessor; + +public: + ~CudaHook(); + + static CudaHook &instance(); + + HookedFunctions &post() + { + return m_post; + } + + HookedFunctions &pre() + { + return m_pre; + } +}; + +} // salus + +#endif // SALUS_CUDAHOOK_H diff --git a/src/cudahook/functions.def b/src/cudahook/functions.def new file mode 100644 index 0000000..027256b --- /dev/null +++ b/src/cudahook/functions.def @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef USE_FUNC +#error USE_FUNC(funcname, ret, params, ...) should be defined before including hooked_functions.def +#endif + +USE_FUNC(cuLaunch, int, (void* f), f) +USE_FUNC(cuLaunchGrid, int, (void* f, int grid_width, int grid_height), f, grid_width, grid_height) +USE_FUNC(cuLaunchGridAsync, int, (void* f, int grid_width, int grid_height, void* stream), f, grid_width, grid_height, stream) +USE_FUNC(cuFuncSetBlockShape, int, (void* f, int x, int y, int z), f, x, y, z) +USE_FUNC(cuFuncSetSharedSize, int, (void* f, unsigned int bytes), f, bytes) + +USE_FUNC(cuLaunchKernel, int, (void* f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, void* hStream, void** kernelParams, void** extra), + f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + sharedMemBytes, hStream, kernelParams, extra) + +#undef USE_FUNC diff --git a/src/cudahook/kernellaunches.cpp b/src/cudahook/kernellaunches.cpp new file mode 100644 index 0000000..bb4481a --- /dev/null +++ b/src/cudahook/kernellaunches.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "kernellaunches.h" + +#include "realdlsym.h" +#include "cudahook.h" + +#include + +#include +#include + +namespace { +} // namespace + +namespace salus { + +KernelLaunches kl [[maybe_unused]]; + +KernelLaunches::KernelLaunches() noexcept +{ + // get callback function + m_selfHandle = dlopen(nullptr, RTLD_LAZY); + if (!m_selfHandle) { + std::cerr << "Error to get handle to self executable: " << dlerror() << std::endl; + std::exit(-4); + } + m_kernelLaunchCallback = func_cast(real_dlsym(m_selfHandle, KernelLaunchCallbackFuncationName)); + if (!m_kernelLaunchCallback) { + std::cerr << "Error to find symbol " << KernelLaunchCallbackFuncationName << ": " << dlerror() << std::endl; + std::exit(-5); + } + DetectorCuLaunchKernel::setCallback(m_kernelLaunchCallback); + DetectorCuLaunch::setCallback(m_kernelLaunchCallback); + + // install hooks to detect kernel launches + DetectorCuLaunchKernel::installHooks(); + DetectorCuLaunch::installHooks(); + + // debug + auto envDebug = std::getenv("CUDA_HOOK_DEBUG"); + if (envDebug && envDebug[0] == '1') { + m_debugging = true; + std::cerr << "CUDA Kernel launch recording started." 
<< std::endl; + } +} + +// --------------------------------------------------------------------------------------------------------------------- +// cuLaunchKernel detector +// --------------------------------------------------------------------------------------------------------------------- + +FnKernelLaunchCallback *DetectorCuLaunchKernel::m_callback = nullptr; + +void DetectorCuLaunchKernel::installHooks() +{ + CudaHook::instance().pre().cuLaunchKernel = [](auto, auto gridX, auto gridY, auto gridZ, + auto blkX, auto blkY, auto blkZ, + auto shdMem, auto stream, auto, auto) { + auto &detector = localInstance(); + detector.onCuLaunchKernel({gridX, gridY, gridZ, blkX, blkY, blkZ, shdMem, stream}); + return 0; + }; +} + +DetectorCuLaunchKernel &DetectorCuLaunchKernel::localInstance() +{ + static thread_local DetectorCuLaunchKernel detector; + return detector; +} + +void DetectorCuLaunchKernel::onCuLaunchKernel(details::KernelParams params) +{ + m_kernelParams = params; + m_state = State::Found; + fire(); +} + +void DetectorCuLaunchKernel::fire() +{ + if (m_state != State::Found) { + return; + } + if (m_callback) { + m_callback(m_kernelParams.gridX, m_kernelParams.gridY, m_kernelParams.gridZ, + m_kernelParams.blkX, m_kernelParams.blkY, m_kernelParams.blkZ, + m_kernelParams.shdMem, m_kernelParams.stream); + } + m_state = State::Idle; +} + +// --------------------------------------------------------------------------------------------------------------------- +// cuLaunch detector +// --------------------------------------------------------------------------------------------------------------------- + +FnKernelLaunchCallback *DetectorCuLaunch::m_callback = nullptr; + +void DetectorCuLaunch::installHooks() +{ + CudaHook::instance().pre().cuFuncSetBlockShape = [](auto f, auto x, auto y, auto z) { + localInstance().onCuFuncSetBlockShape(f, x, y, z); + return 0; + }; + CudaHook::instance().pre().cuFuncSetSharedSize = [](auto f, auto size) { + 
localInstance().onCuFuncSetSharedSize(f, size); + return 0; + }; + CudaHook::instance().pre().cuLaunch = [](auto f) { + localInstance().onCuLaunch(f); + return 0; + }; + CudaHook::instance().pre().cuLaunchGrid = [](auto f, auto w, auto h) { + localInstance().onCuLaunchGrid(f, w, h); + return 0; + }; + CudaHook::instance().pre().cuLaunchGridAsync = [](auto f, auto w, auto h, auto stream) { + localInstance().onCuLaunchGridAsync(f, w, h, stream); + return 0; + }; +} + +DetectorCuLaunch &DetectorCuLaunch::localInstance() +{ + static thread_local DetectorCuLaunch detector; + return detector; +} + +void DetectorCuLaunch::onCuFuncSetBlockShape(void *f, int x, int y, int z) +{ + auto ¶ms = ensureParams(f); + params.blkX = x; + params.blkY = y; + params.blkZ = z; +} + +void DetectorCuLaunch::onCuFuncSetSharedSize(void *f, unsigned int bytes) +{ + auto ¶ms = ensureParams(f); + params.shdMem = bytes; +} + +void DetectorCuLaunch::onCuLaunch(void *func) +{ + auto ¶ms = ensureParams(func); + params.gridX = params.gridY = params.gridZ = 1; + fire(params); + m_params.erase(func); +} + +void DetectorCuLaunch::onCuLaunchGrid(void *f, int grid_width, int grid_height) +{ + auto ¶ms = ensureParams(f); + params.gridX = grid_width; + params.gridY = grid_height; + params.gridZ = 1; + fire(params); + m_params.erase(f); +} + +void DetectorCuLaunch::onCuLaunchGridAsync(void *f, int grid_width, int grid_height, void *stream) +{ + auto ¶ms = ensureParams(f); + params.gridX = grid_width; + params.gridY = grid_height; + params.gridZ = 1; + params.stream = stream; + fire(params); + m_params.erase(f); +} + +details::KernelParams &DetectorCuLaunch::ensureParams(void *func) +{ + auto it = m_params.try_emplace(func, details::KernelParams{1, 1, 1, 1, 1, 1, 0, nullptr}).first; + return it->second; +} + +void DetectorCuLaunch::fire(const details::KernelParams ¶ms) +{ + if (m_callback) { + m_callback(params.gridX, params.gridY, params.gridZ, + params.blkX, params.blkY, params.blkZ, + params.shdMem, 
params.stream); + } +} + +} // namespace salus diff --git a/src/cudahook/kernellaunches.h b/src/cudahook/kernellaunches.h new file mode 100644 index 0000000..772b6ca --- /dev/null +++ b/src/cudahook/kernellaunches.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SALUS_KERNELLAUNCHES_H +#define SALUS_KERNELLAUNCHES_H + +#include <cstdint> +#include <unordered_map> + +namespace salus { + +constexpr auto KernelLaunchCallbackFuncationName = "salus_kernel_launch_callback"; + +using FnKernelLaunchCallback = void (unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, void *hStream); + +namespace details { + +struct KernelParams +{ + uint32_t gridX = 0; + uint32_t gridY = 0; + uint32_t gridZ = 0; + uint32_t blkX = 0; + uint32_t blkY = 0; + uint32_t blkZ = 0; + uint32_t shdMem = 0; + void *stream = nullptr; +}; + +} // namespace details + +class DetectorCuLaunchKernel +{ + static FnKernelLaunchCallback *m_callback; + +public: + static void setCallback(FnKernelLaunchCallback *callback) + { + m_callback = callback; + } + + static void installHooks(); + + static DetectorCuLaunchKernel &localInstance(); + + /* + * Actual detector logic below + */ +private: + enum class State { + Idle, Found + }; + State m_state = State::Idle; + + details::KernelParams m_kernelParams; + + void fire(); + +public: + DetectorCuLaunchKernel() = default; + + void onCuLaunchKernel(details::KernelParams params); +}; + +class DetectorCuLaunch +{ + static FnKernelLaunchCallback *m_callback; +public: + static void setCallback(FnKernelLaunchCallback *callback) + { + m_callback = callback; + } + + static void installHooks(); + + static DetectorCuLaunch &localInstance(); + + // Actual detector logic below +private: + enum class State { + Idle + }; + State m_state = State::Idle; + + std::unordered_map<void *, details::KernelParams> m_params; + + details::KernelParams &ensureParams(void *func); + + void fire(const details::KernelParams &params); + +public: + void onCuFuncSetBlockShape(void* f, int x, int y, int z); + void onCuFuncSetSharedSize(void* f, unsigned int bytes); + void onCuLaunch(void *func); + void onCuLaunchGrid(void* f, int grid_width, int grid_height); + void 
onCuLaunchGridAsync(void* f, int grid_width, int grid_height, void* stream); +}; + +class KernelLaunches +{ + void *m_selfHandle = nullptr; + FnKernelLaunchCallback *m_kernelLaunchCallback = nullptr; + + bool m_debugging = false; + +public: + KernelLaunches() noexcept; + + bool debugging() const { return m_debugging; } +}; + +} // namespace salus + +#endif // SALUS_KERNELLAUNCHES_H diff --git a/src/cudahook/realdlsym.cpp b/src/cudahook/realdlsym.cpp new file mode 100644 index 0000000..1a1b21c --- /dev/null +++ b/src/cudahook/realdlsym.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "realdlsym.h" + +#include + +extern "C" { +// For interposing dlsym(). See elf/dl-libc.c for the internal dlsym interface function +void* __libc_dlsym (void *map, const char *name); +} + +namespace salus { + +using FnDlsym = void *(void*, const char*); +void* real_dlsym(void *handle, const char* symbol) noexcept +{ + static auto internal_dlsym = func_cast(__libc_dlsym(dlopen("libdl.so.2", RTLD_LAZY), "dlsym")); + return (*internal_dlsym)(handle, symbol); +} + +} // namespace salus diff --git a/src/cudahook/realdlsym.h b/src/cudahook/realdlsym.h new file mode 100644 index 0000000..3e87c24 --- /dev/null +++ b/src/cudahook/realdlsym.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SALUS_CUDAHOOK_REALDLSYM_H +#define SALUS_CUDAHOOK_REALDLSYM_H + +#include + +namespace salus { + +void *real_dlsym(void *handle, const char *symbol) noexcept; + +template +constexpr auto func_cast(void *ptr) noexcept { + return reinterpret_cast(reinterpret_cast(ptr)); +} + +} // namespace salus + +#endif // SALUS_CUDAHOOK_REALDLSYM_H diff --git a/src/execution/engine/taskexecutor.cpp b/src/execution/engine/taskexecutor.cpp index 13faf31..e8ad0bc 100644 --- a/src/execution/engine/taskexecutor.cpp +++ b/src/execution/engine/taskexecutor.cpp @@ -27,6 +27,7 @@ #include "execution/scheduler/basescheduler.h" #include "execution/scheduler/operationitem.h" #include "utils/date.h" +#include "platform/thread_annotations.h" using std::chrono::duration_cast; using std::chrono::microseconds; @@ -134,6 +135,8 @@ void TaskExecutor::queueTask(POpItem &&opItem) void TaskExecutor::scheduleLoop() { + threading::set_thread_name("TaskExecutor"); + auto scheduler = SchedulerRegistary::instance().create(m_schedParam.scheduler, *this); DCHECK(scheduler); VLOG(2) << "Using scheduler: " << scheduler; diff --git a/src/execution/executionengine.cpp b/src/execution/executionengine.cpp index 2041fd4..28b8f5d 100644 --- a/src/execution/executionengine.cpp +++ b/src/execution/executionengine.cpp @@ -25,6 +25,7 @@ #include "execution/engine/resourcecontext.h" #include "execution/iterationtask.h" #include "platform/logging.h" +#include 
"platform/thread_annotations.h" #include "utils/containerutils.h" #include "utils/date.h" #include "utils/debugging.h" @@ -125,6 +126,7 @@ void ExecutionEngine::maybeWaitForWork(size_t pending, size_t scheduled) void ExecutionEngine::scheduleLoop() { LOG(INFO) << "ExecutionEngine scheduling thread started"; + threading::set_thread_name("ExecutionEngine"); // a map of lane id to thread local queues. std::unordered_map queues; diff --git a/src/execution/threadpool/nonblockingthreadpool.cpp b/src/execution/threadpool/nonblockingthreadpool.cpp index 43f53a4..57143ea 100644 --- a/src/execution/threadpool/nonblockingthreadpool.cpp +++ b/src/execution/threadpool/nonblockingthreadpool.cpp @@ -26,6 +26,7 @@ #include "EventCount.h" #include "utils/fixed_function.hpp" #include "RunQueue.h" +#include "platform/thread_annotations.h" #include #include @@ -329,6 +330,8 @@ int ThreadPoolPrivate::nonEmptyQueueIndex() void ThreadPoolPrivate::workerLoop(int thread_id) { + salus::threading::set_thread_name("ThreadPoolWorker"); + const auto numThreads = m_options.numThreads; const auto spinCount = m_options.spinCount; const auto allowSpinning = m_options.allowSpinning; diff --git a/src/oplibraries/tensorflow/tfinstance.cpp b/src/oplibraries/tensorflow/tfinstance.cpp index eef26de..1b667eb 100644 --- a/src/oplibraries/tensorflow/tfinstance.cpp +++ b/src/oplibraries/tensorflow/tfinstance.cpp @@ -156,6 +156,7 @@ void TFInstance::handleCreateSession(std::unique_ptr & // NOTE: laneId on ectx is separated from actual lane implementation. // It is only used to have separate scheduling domain. So use first lane's id as the id // Revisit if later multi-lane for a job is implemented. 
+ // TODO: support multiple lane id ectx->setLaneId(lanes.at(0)->id()); auto session = diff --git a/src/oplibraries/tensorflow/v3/smblocker.cpp b/src/oplibraries/tensorflow/v3/smblocker.cpp new file mode 100644 index 0000000..b369379 --- /dev/null +++ b/src/oplibraries/tensorflow/v3/smblocker.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "oplibraries/tensorflow/tensorflow_headers.h" +#include "oplibraries/tensorflow/v3/smblocker.h" +#include "utils/threadutils.h" +#include "utils/containerutils.h" + +#include + +namespace { + +struct SalusCudaKernelLaunchParams +{ + struct Vec3 + { + uint64_t x; + uint64_t y; + uint64_t z; + }; + Vec3 blockCount; + Vec3 threadPerBlock; + uint64_t sharedMemBytes; +}; + +thread_local std::vector SavedCudaKernelLaunches{}; +thread_local uint64_t CurrentThreadHoldingBlocks = 0; + +inline auto max(uint64_t a, SalusCudaKernelLaunchParams::Vec3 vec) +{ + auto b = vec.x * vec.y * vec.z; + return std::max(a, b); +} + +} // namespace + +extern "C" { + +void salus_kernel_launch_callback(unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, + void *) +{ + SavedCudaKernelLaunches.push_back(SalusCudaKernelLaunchParams{ + {gridDimX, gridDimY, gridDimZ}, + {blockDimX, blockDimY, blockDimZ}, + sharedMemBytes, + }); + LOG(DEBUG) << "Got kernel launch params: (" + << gridDimX << "," << gridDimY << "," << gridDimZ + << ") x (" << blockDimX << "," << blockDimY << "," << blockDimZ << "), " << sharedMemBytes; +} + +} // extern "C" + +namespace salus::oplib::tensorflow { + +SMBlocker &SMBlocker::instance() +{ + static SMBlocker blocker; + return blocker; +} + +SMUsage SMBlocker::queryAvailableSM() +{ + auto gpu_manager = tf::GPUMachineManager(); + // TODO: assume each device has the same number of SM + auto se = gpu_manager->ExecutorForDevice(0).ValueOrDie(); + return { + se->GetDeviceDescription().threads_per_block_limit(), + static_cast(se->GetDeviceDescription().core_count()) + }; +} + +SMBlocker::SMBlocker() + : m_maxUsage{queryAvailableSM()} + , m_freeBlocks(m_maxUsage.blockCount) +{ +} + +void SMBlocker::saveCurrentThreadResults(uint64_t graphId, int nodeId) +{ + // release blocks first + { + m_freeBlocks.notify(CurrentThreadHoldingBlocks); + 
CurrentThreadHoldingBlocks = 0; + } + + // update cache + std::unique_lock l{m_mu}; + + auto &usage = m_cache[std::make_pair(graphId, nodeId)]; + + if (usage.blockCount != 0 || usage.threadPerBlock != 0) { + LOG(WARNING) << "Overriding SM usage for graph " << graphId << " node " << nodeId + << ", previous: " << usage.blockCount << " " << usage.threadPerBlock; + } + + for (const auto &res : SavedCudaKernelLaunches) { + usage.threadPerBlock = max(usage.threadPerBlock, res.threadPerBlock); + usage.blockCount = max(usage.blockCount, res.blockCount); + } + SavedCudaKernelLaunches.clear(); +} + +bool SMBlocker::maybeBlock(uint64_t graphId, int nodeId) +{ + auto smUsage = getUsageForKernel(graphId, nodeId); + + return m_freeBlocks.may_block(smUsage); +} + +void SMBlocker::wait(uint64_t graphId, int nodeId) +{ + auto smUsage = getUsageForKernel(graphId, nodeId); + + // save the count + CurrentThreadHoldingBlocks = smUsage; + + LOG(DEBUG) << "Wait at SMBlocker: graph " << graphId << " node " << nodeId; + m_freeBlocks.wait(smUsage); + LOG(DEBUG) << "Passed at SMBlocker: graph " << graphId << " node " << nodeId; +} + +uint64_t SMBlocker::getUsageForKernel(uint64_t graphId, int nodeId) +{ + std::shared_lock l{m_mu}; + + auto usage = sstl::getOrDefault(m_cache, {graphId, nodeId}, {}); + + return std::min(usage.blockCount, m_maxUsage.blockCount); +} + +} // namespace salus::oplib::tensorflow diff --git a/src/oplibraries/tensorflow/v3/smblocker.h b/src/oplibraries/tensorflow/v3/smblocker.h new file mode 100644 index 0000000..003afa4 --- /dev/null +++ b/src/oplibraries/tensorflow/v3/smblocker.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2019, peifeng + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SALUS_OPLIB_TENSORFLOW_SMBLOCKER_H +#define SALUS_OPLIB_TENSORFLOW_SMBLOCKER_H + +#include "oplibraries/tensorflow/tensorflow_headers.h" + +#include "utils/threadutils.h" + +#include + +#include +#include +#include + +namespace salus::oplib::tensorflow { +struct SMUsage +{ + uint64_t threadPerBlock = 0; + uint64_t blockCount = 0; +}; + +class SMBlocker +{ +public: + static SMBlocker &instance(); + + void saveCurrentThreadResults(uint64_t graphId, int nodeId); + + bool maybeBlock(uint64_t graphId, int nodeId); + void wait(uint64_t graphId, int nodeId); + +private: + static SMUsage queryAvailableSM(); + + SMBlocker(); + + uint64_t getUsageForKernel(uint64_t graphId, int nodeId); + + const SMUsage m_maxUsage; + + sstl::semaphore m_freeBlocks; + + using KernelId = std::pair; + std::unordered_map> m_cache; + std::shared_mutex m_mu; +}; + +} // namespace salus::oplib::tensorflow + +#endif // SALUS_OPLIB_TENSORFLOW_SMBLOCKER_H diff --git a/src/oplibraries/tensorflow/v3/tf_executor.cpp b/src/oplibraries/tensorflow/v3/tf_executor.cpp index dafba69..cf0fcea 100644 --- a/src/oplibraries/tensorflow/v3/tf_executor.cpp +++ b/src/oplibraries/tensorflow/v3/tf_executor.cpp @@ -19,6 +19,7 @@ limitations under the License. 
#include "execution/engine/iterationcontext.h" #include "execution/iterationtask.h" #include "oplibraries/tensorflow/tfinstance.h" +#include "oplibraries/tensorflow/v3/smblocker.h" #include "utils/envutils.h" namespace salus::oplib::tensorflow { @@ -1484,7 +1485,14 @@ void ExecutorState::Process(TaggedNode tagged_node, tf::int64) inline_ready.push_back(tagged_node); while (!inline_ready.empty()) { tagged_node = inline_ready.front(); + + if (SMBlocker::instance().maybeBlock(impl_->graph_id_, tagged_node.node->id())) { + continue; + } + SMBlocker::instance().wait(impl_->graph_id_, tagged_node.node->id()); + inline_ready.pop_front(); + const auto *node = tagged_node.node; FrameState *input_frame = tagged_node.input_frame; const tf::int64 input_iter = tagged_node.input_iter; @@ -1564,6 +1572,8 @@ void ExecutorState::Process(TaggedNode tagged_node, tf::int64) AsyncState *state = new AsyncState(params, tagged_node, &item, first_input, nullptr); auto done = [this, state]() { + SMBlocker::instance().saveCurrentThreadResults(impl_->graph_id_, state->item->node->id()); + auto *device = impl_->params_.device; Entry *first_input = state->first_input; // Shorthand @@ -1606,6 +1616,9 @@ void ExecutorState::Process(TaggedNode tagged_node, tf::int64) tf::OpKernelContext ctx(&params, item.num_outputs); CHECK_NOTNULL(op_kernel); device->Compute(op_kernel, &ctx); + + SMBlocker::instance().saveCurrentThreadResults(impl_->graph_id_, item.node->id()); + s = ProcessOutputs(item, &ctx, &outputs, nullptr); if (s.ok() && impl_->device_record_tensor_accesses_) { // Get the list of all tensors accessed during the execution diff --git a/src/platform/CMakeLists.txt b/src/platform/CMakeLists.txt index 088cc12..5f9cb1f 100644 --- a/src/platform/CMakeLists.txt +++ b/src/platform/CMakeLists.txt @@ -8,10 +8,11 @@ if(WIN32) "windows/memory.cpp" "windows/signals.cpp" ) -else(WIN32) +else() # POSIX list(APPEND SRC_LIST "posix/memory.cpp" "posix/signals.cpp" + "posix/thread_annotations.cpp" ) endif(WIN32) 
@@ -22,4 +23,13 @@ target_link_libraries(platform PRIVATE protos_gen + Threads::Threads ) + +if(NOT WIN32) + # POSIX + target_compile_definitions(platform + PRIVATE + _GNU_SOURCE=1 + ) +endif() diff --git a/src/platform/posix/thread_annotations.cpp b/src/platform/posix/thread_annotations.cpp new file mode 100644 index 0000000..3ce20a8 --- /dev/null +++ b/src/platform/posix/thread_annotations.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2019 Peifeng Yu + * + * This file is part of Salus + * (see https://github.com/SymbioticLab/Salus). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "platform/thread_annotations.h" + +#include <pthread.h> + +#include <string> + +namespace salus::threading { + +void set_thread_name(std::string_view name) +{ +#if defined(__GLIBC__) + // maximum length is 16 (including \0), longer than that causes an error + constexpr auto MAX_THREAD_NAME_LENGTH = 16 - 1; + pthread_setname_np(pthread_self(), std::string(name.substr(0, MAX_THREAD_NAME_LENGTH)).c_str()); +#elif defined(__APPLE__) + pthread_setname_np(std::string(name).c_str()); +#else +#error unsupported platform for POSIX! 
+#endif +} + +} // namespace salus::threading diff --git a/src/platform/thread_annotations.h b/src/platform/thread_annotations.h index a4d6414..8db0e77 100644 --- a/src/platform/thread_annotations.h +++ b/src/platform/thread_annotations.h @@ -155,4 +155,11 @@ inline T &ts_unchecked_read(T &v) NO_THREAD_SAFETY_ANALYSIS } } // namespace salus::thread_safety_analysis +#include +namespace salus::threading { + +void set_thread_name(std::string_view name); + +} // namespace salus::threading + #endif // SALUS_PLATFORM_THREAD_ANNOTATIONS_H_ diff --git a/src/rpcserver/iothreadpool.cpp b/src/rpcserver/iothreadpool.cpp index 4bb08be..7388c69 100644 --- a/src/rpcserver/iothreadpool.cpp +++ b/src/rpcserver/iothreadpool.cpp @@ -18,6 +18,7 @@ */ #include "iothreadpool.h" +#include "platform/thread_annotations.h" #include @@ -42,6 +43,7 @@ IOThreadPoolImpl::~IOThreadPoolImpl() void IOThreadPoolImpl::workerLoop() { + threading::set_thread_name("salus::IOThreadPoolWorker"); m_context.run(); } diff --git a/src/salus-server.list b/src/salus-server.list new file mode 100644 index 0000000..f7e71f2 --- /dev/null +++ b/src/salus-server.list @@ -0,0 +1,3 @@ +{ + salus_kernel_launch_callback; +}; diff --git a/src/utils/threadutils.cpp b/src/utils/threadutils.cpp index 6b7bf6f..c368b9d 100644 --- a/src/utils/threadutils.cpp +++ b/src/utils/threadutils.cpp @@ -21,7 +21,7 @@ namespace sstl { -void semaphore::notify(uint32_t c) +void semaphore::notify(uint64_t c) { { auto l = with_guard(m_mu); @@ -31,7 +31,13 @@ void semaphore::notify(uint32_t c) m_cv.notify_all(); } -void semaphore::wait(uint32_t c) +bool semaphore::may_block(uint64_t c) +{ + auto lock = with_guard(m_mu); + return m_count < c; +} + +void semaphore::wait(uint64_t c) { auto lock = with_uguard(m_mu); m_cv.wait(lock, [&]() { return m_count >= c; }); diff --git a/src/utils/threadutils.h b/src/utils/threadutils.h index 87a43ea..c267220 100644 --- a/src/utils/threadutils.h +++ b/src/utils/threadutils.h @@ -163,12 +163,17 @@ class 
semaphore { std::mutex m_mu; std::condition_variable m_cv; - uint32_t m_count = 0; // Initialized as locked. + uint64_t m_count = 0; public: - void notify(uint32_t c = 1); + // Initialized as locked. + explicit semaphore(uint64_t init = 0) : m_count(init) {} - void wait(uint32_t c = 1); + void notify(uint64_t c = 1); + + void wait(uint64_t c = 1); + + bool may_block(uint64_t c = 1); }; /** From e7f4f5180984848728456e91df5a70f10212b368 Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Thu, 4 Apr 2019 23:46:27 -0400 Subject: [PATCH 04/13] Fix unittest compability with py3 --- src/oplibraries/tensorflow/v3/smblocker.cpp | 19 +++++++++++-------- tests/test_tf/lib/datasets/ptb_reader.py | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/oplibraries/tensorflow/v3/smblocker.cpp b/src/oplibraries/tensorflow/v3/smblocker.cpp index b369379..1978099 100644 --- a/src/oplibraries/tensorflow/v3/smblocker.cpp +++ b/src/oplibraries/tensorflow/v3/smblocker.cpp @@ -69,9 +69,9 @@ void salus_kernel_launch_callback(unsigned int gridDimX, unsigned int gridDimY, {blockDimX, blockDimY, blockDimZ}, sharedMemBytes, }); - LOG(DEBUG) << "Got kernel launch params: (" + LOG(DEBUG) << "Got kernel launch params: blk=(" << gridDimX << "," << gridDimY << "," << gridDimZ - << ") x (" << blockDimX << "," << blockDimY << "," << blockDimZ << "), " << sharedMemBytes; + << ") x thd=(" << blockDimX << "," << blockDimY << "," << blockDimZ << "), " << sharedMemBytes; } } // extern "C" @@ -112,17 +112,20 @@ void SMBlocker::saveCurrentThreadResults(uint64_t graphId, int nodeId) // update cache std::unique_lock l{m_mu}; - auto &usage = m_cache[std::make_pair(graphId, nodeId)]; + SMUsage newUsage{0, 0}; + for (const auto &res : SavedCudaKernelLaunches) { + newUsage.threadPerBlock = max(newUsage.threadPerBlock, res.threadPerBlock); + newUsage.blockCount = max(newUsage.blockCount, res.blockCount); + } + auto &usage = m_cache[std::make_pair(graphId, nodeId)]; if (usage.blockCount != 0 || 
usage.threadPerBlock != 0) { LOG(WARNING) << "Overriding SM usage for graph " << graphId << " node " << nodeId - << ", previous: " << usage.blockCount << " " << usage.threadPerBlock; + << ", previous: blk=" << usage.blockCount << " thd=" << usage.threadPerBlock + << ", new: blk=" << newUsage.blockCount << " thd=" << newUsage.threadPerBlock; } + usage = newUsage; - for (const auto &res : SavedCudaKernelLaunches) { - usage.threadPerBlock = max(usage.threadPerBlock, res.threadPerBlock); - usage.blockCount = max(usage.blockCount, res.blockCount); - } SavedCudaKernelLaunches.clear(); } diff --git a/tests/test_tf/lib/datasets/ptb_reader.py b/tests/test_tf/lib/datasets/ptb_reader.py index a1b7609..8f1874c 100644 --- a/tests/test_tf/lib/datasets/ptb_reader.py +++ b/tests/test_tf/lib/datasets/ptb_reader.py @@ -37,7 +37,7 @@ def __init__(self, config, data, name=None): def _read_words(filename): - with tf.gfile.GFile(filename, "r") as f: + with tf.gfile.GFile(filename, "rb") as f: return f.read().decode("utf-8").replace("\n", "").split() From 70cca56f33bdba23f928cd9d30a7676969c44849 Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Thu, 4 Apr 2019 23:47:06 -0400 Subject: [PATCH 05/13] New card308 to save SavedModel --- benchmarks/driver/__main__.py | 2 +- benchmarks/driver/runner.py | 16 +++++++- benchmarks/exps/__init__.py | 72 +++++++++++++++++++++++++++++++++-- benchmarks/exps/card308.py | 64 +++++++++++++++++++++++++++++++ 4 files changed, 148 insertions(+), 6 deletions(-) create mode 100644 benchmarks/exps/card308.py diff --git a/benchmarks/driver/__main__.py b/benchmarks/driver/__main__.py index f2bbc16..a6ac7cb 100644 --- a/benchmarks/driver/__main__.py +++ b/benchmarks/driver/__main__.py @@ -156,7 +156,7 @@ def parse_expname(args): def main(): - # type: (Sequence[str]) -> None + # type: () -> None # find first argument not starting with dash exp, argv = parse_expname(sys.argv) diff --git a/benchmarks/driver/runner.py b/benchmarks/driver/runner.py index ea8e5e1..df09359 
100644 --- a/benchmarks/driver/runner.py +++ b/benchmarks/driver/runner.py @@ -116,23 +116,35 @@ def __call__(self, executor, output_file): eval_interval = self.wl.env.pop('SALUS_TFBENCH_EVAL_INTERVAL', '0.1') eval_rand_factor = self.wl.env.pop('SALUS_TFBENCH_EVAL_RAND_FACTOR', '5') eval_block = self.wl.env.pop('SALUS_TFBENCH_EVAL_BLOCK', 'true') + + eval_model_dir = self.wl.env.pop('SALUS_TFBENCH_EVAL_MODEL_DIR', 'models') + eval_model_dir = str(Path(eval_model_dir).joinpath(self.wl.name.rstrip('eval'))) + + eval_saved_model_dir = self.wl.env.pop('SALUS_TFBENCH_EVAL_SAVED_MODEL_DIR', None) + if eval_saved_model_dir is not None: + eval_saved_model_dir = str(Path(eval_saved_model_dir).joinpath(self.wl.name.rstrip('eval'))) + if self.wl.name.endswith('eval'): model_name = self.wl.name.rsplit('eval')[0] cmd += [ - '--model_dir=models/{}'.format(model_name), + '--model_dir=' + eval_model_dir, '--model={}'.format(model_name), '--eval_interval_secs={}'.format(eval_interval), '--eval_interval_random_factor={}'.format(eval_rand_factor), '--eval_block={}'.format(eval_block), '--eval' ] + if eval_saved_model_dir is not None: + cmd += [ + '--saved_model_dir=' + eval_saved_model_dir + ] else: cmd += [ '--model={}'.format(self.wl.name), ] if str2bool(self.wl.env.pop('SALUS_SAVE_MODEL', '')): cmd += [ - '--model_dir=models/{}'.format(self.wl.name), + '--model_dir=' + eval_model_dir, ] if FLAGS.no_capture: diff --git a/benchmarks/exps/__init__.py b/benchmarks/exps/__init__.py index 217c5b2..2ba3001 100644 --- a/benchmarks/exps/__init__.py +++ b/benchmarks/exps/__init__.py @@ -19,23 +19,25 @@ # from __future__ import absolute_import, print_function, division, unicode_literals +import itertools import time import re import logging from absl import flags -from typing import Union, Iterable, List, TypeVar, Callable +from typing import Union, Iterable, List, TypeVar, Callable, Optional import benchmarks.driver.utils.prompt as prompt from benchmarks.driver.runner import Executor from 
benchmarks.driver.server.config import presets from benchmarks.driver.server import SalusServer, SalusConfig from benchmarks.driver.tfserver import TFDistServer -from benchmarks.driver.utils import atomic_directory, try_with_default, UsageError, kill_tree +from benchmarks.driver.utils import atomic_directory, try_with_default, UsageError, kill_tree, unique from benchmarks.driver.utils.compatiblity import pathlib from benchmarks.driver.workload import Workload, WTL, ResourceGeometry Path = pathlib.Path T = TypeVar('T') +TBatchSize = Union[str, int] logger = logging.getLogger(__name__) FLAGS = flags.FLAGS @@ -70,7 +72,7 @@ class RunFn(object): __slots__ = '_fn' def __init__(self, fn): - # type: (Callable[Iterable[Workload], None]) -> None + # type: (Callable[[Iterable[Workload], *str], None]) -> None self._fn = fn def run(self, workloads, **kwargs): @@ -290,3 +292,67 @@ def update_jct(workload, update_global=False): workload.geometry.jct = jct if update_global: WTL.from_name(workload.name).add_geometry(workload.rcfg, workload.executor, ResourceGeometry(jct=jct)) + + +def select_workloads(argv, # type: Iterable[str] + batch_size=None, # type: Optional[Union[Iterable[TBatchSize], TBatchSize]] + batch_num=None, # type: Optional[Union[Iterable[int], int]] + executor=None # type: Optional[Union[Iterable[Executor], Executor]] + ): + # type: (...) -> Iterable[Workload] + """Select workloads based on commandline + Workloads can be separated by comma (',') or space (' '). + The workload name can include an underscore ('_') separated batch size. + + If no batch size part is included, batch sizes given in argument are selected. + + If argv is empty, all available workloads are selected. + + batch_size, batch_num, executor arguments expect a list of possible values, single values are converted into lists. 
+ + Returns: list of created Workload instance + + Example: alexnet_25,vgg11 inception3_75 + """ + if batch_size is not None: + if not isinstance(batch_size, list): + batch_size = [batch_size] + + if batch_num is None: + batch_num = [1] + else: + if not isinstance(batch_num, list): + batch_num = [batch_num] + + if executor is None: + executor = [Executor.Salus] + else: + if not isinstance(executor, list): + executor = [executor] + + if not argv: + names = WTL.known_workloads.keys() + else: + names = unique(( + name + for piece in argv + for name in piece.split(',') + ), stable=True) + + def expandbs(name): + if '_' in name: + name, bs = name.split('_') + return [(name, int(bs))] + else: + avail = WTL.from_name(name).available_batch_sizes() + if batch_size is None: + bss = avail + else: + bss = [bs for bs in batch_size if bs in avail] + return zip([name] * len(bss), bss) + + wls = [name_bs for name in names for name_bs in expandbs(name)] + + wls = itertools.product(wls, batch_num, executor) + + return [WTL.create(name, bs, bn, ex) for (name, bs), bn, ex in wls] diff --git a/benchmarks/exps/card308.py b/benchmarks/exps/card308.py new file mode 100644 index 0000000..4b656f2 --- /dev/null +++ b/benchmarks/exps/card308.py @@ -0,0 +1,64 @@ +# -*- coding: future_fstrings -*- +# +# Copyright 2019 Peifeng Yu +# +# This file is part of Salus +# (see https://github.com/SymbioticLab/Salus). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +Card 308: Make inference job accepts requests using tfweb + +Export SavedModel for existing tf_cnn_benchmark models + +Collected data: SavedModel +""" +from __future__ import absolute_import, print_function, division, unicode_literals + +from absl import flags +from typing import Sequence +import logging + +from benchmarks.driver.runner import TFBenchmarkRunner +from benchmarks.driver.utils.compatiblity import pathlib +from benchmarks.driver.workload import Executor +from benchmarks.exps import run_tf, select_workloads + +FLAGS = flags.FLAGS +logger = logging.getLogger(__name__) + + +def main(argv): + # type: (Sequence[str]) -> None + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + + saved_model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/saved_models') + saved_model_dir = saved_model_dir.expanduser().resolve() + + for wl in select_workloads(argv, batch_size=1, batch_num=1, executor=Executor.TF): + if wl.wtl.runnerCls is not TFBenchmarkRunner: + logger.info(f'Skipping {wl.name}') + continue + if not wl.name.endswith('eval'): + logger.info(f'Skipping {wl.name}') + continue + + logger.info(f"**** Saving SavedModel: {wl.canonical_name}") + logger.info(f"**** Location: {FLAGS.save_dir}") + + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir) + wl.env['SALUS_TFBENCH_EVAL_SAVED_MODEL_DIR'] = str(saved_model_dir) + run_tf(FLAGS.save_dir, wl) From faa64f3fba2985ab9e1f496a1ae693c35011e28f Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Thu, 4 Apr 2019 23:47:33 -0400 Subject: [PATCH 06/13] Move old exp --- benchmarks/exps/old/__init__.py | 0 benchmarks/exps/{ => old}/bigrun.py | 0 benchmarks/exps/{ => old}/card13.py | 0 benchmarks/exps/{ => old}/card185.py | 0 benchmarks/exps/{ => old}/card186.py | 0 benchmarks/exps/{ => old}/card188.py | 0 benchmarks/exps/{ => old}/card189.py | 0 benchmarks/exps/{ => old}/card194.py | 0 
benchmarks/exps/{ => old}/card195.py | 0 benchmarks/exps/{ => old}/card198.py | 0 benchmarks/exps/{ => old}/card203.py | 0 benchmarks/exps/{ => old}/card206.py | 0 benchmarks/exps/{ => old}/card207.py | 0 benchmarks/exps/{ => old}/card210.py | 0 benchmarks/exps/{ => old}/card211.py | 0 benchmarks/exps/{ => old}/card212.py | 0 benchmarks/exps/{ => old}/card214.py | 0 benchmarks/exps/{ => old}/card218.py | 0 benchmarks/exps/{ => old}/card219.py | 0 benchmarks/exps/{ => old}/card233.py | 0 benchmarks/exps/{ => old}/card234.py | 0 benchmarks/exps/{ => old}/card235.py | 0 benchmarks/exps/{ => old}/card236.py | 0 benchmarks/exps/{ => old}/card240.py | 0 benchmarks/exps/{ => old}/card241.py | 0 benchmarks/exps/{ => old}/card249.py | 0 benchmarks/exps/{ => old}/card250.py | 0 benchmarks/exps/{ => old}/card251.py | 0 benchmarks/exps/{ => old}/card252.py | 0 benchmarks/exps/{ => old}/card259.py | 0 benchmarks/exps/{ => old}/card262.py | 0 benchmarks/exps/{ => old}/card266.py | 0 benchmarks/exps/{ => old}/card270.py | 0 benchmarks/exps/{ => old}/card271.py | 0 benchmarks/exps/{ => old}/card272.py | 0 benchmarks/exps/{ => old}/card275.py | 0 benchmarks/exps/{ => old}/exp10.py | 0 benchmarks/exps/{ => old}/exp11.py | 0 benchmarks/exps/{ => old}/exp12.py | 0 benchmarks/exps/{ => old}/exp13.py | 0 benchmarks/exps/{ => old}/exp14.py | 0 benchmarks/exps/{ => old}/exp15.py | 0 benchmarks/exps/{ => old}/exp18.py | 0 benchmarks/exps/{ => old}/exp19.py | 0 benchmarks/exps/{ => old}/exp3.py | 0 benchmarks/exps/{ => old}/exp6_2.py | 0 benchmarks/exps/{ => old}/exp9.py | 0 benchmarks/exps/{ => old}/fairnessjct.py | 0 benchmarks/exps/{ => old}/gperf.py | 0 benchmarks/exps/{ => old}/jct.py | 0 benchmarks/exps/{ => old}/mem.py | 0 benchmarks/exps/{ => old}/memop.py | 0 benchmarks/exps/{ => old}/mnist.py | 0 benchmarks/exps/{ => old}/mnistnv.py | 0 benchmarks/exps/{ => old}/one.py | 0 benchmarks/exps/{ => old}/onetf.py | 0 benchmarks/exps/{ => old}/optracing.py | 0 benchmarks/exps/{ => 
old}/paral.py | 0 benchmarks/exps/{ => old}/paral2.py | 0 benchmarks/exps/{ => old}/paraltf.py | 0 benchmarks/exps/{ => old}/run5.py | 0 benchmarks/exps/{ => old}/save_train_dir.py | 14 ++++++-------- benchmarks/exps/{ => old}/serialmany.py | 0 benchmarks/exps/{ => old}/two.py | 0 64 files changed, 6 insertions(+), 8 deletions(-) create mode 100644 benchmarks/exps/old/__init__.py rename benchmarks/exps/{ => old}/bigrun.py (100%) rename benchmarks/exps/{ => old}/card13.py (100%) rename benchmarks/exps/{ => old}/card185.py (100%) rename benchmarks/exps/{ => old}/card186.py (100%) rename benchmarks/exps/{ => old}/card188.py (100%) rename benchmarks/exps/{ => old}/card189.py (100%) rename benchmarks/exps/{ => old}/card194.py (100%) rename benchmarks/exps/{ => old}/card195.py (100%) rename benchmarks/exps/{ => old}/card198.py (100%) rename benchmarks/exps/{ => old}/card203.py (100%) rename benchmarks/exps/{ => old}/card206.py (100%) rename benchmarks/exps/{ => old}/card207.py (100%) rename benchmarks/exps/{ => old}/card210.py (100%) rename benchmarks/exps/{ => old}/card211.py (100%) rename benchmarks/exps/{ => old}/card212.py (100%) rename benchmarks/exps/{ => old}/card214.py (100%) rename benchmarks/exps/{ => old}/card218.py (100%) rename benchmarks/exps/{ => old}/card219.py (100%) rename benchmarks/exps/{ => old}/card233.py (100%) rename benchmarks/exps/{ => old}/card234.py (100%) rename benchmarks/exps/{ => old}/card235.py (100%) rename benchmarks/exps/{ => old}/card236.py (100%) rename benchmarks/exps/{ => old}/card240.py (100%) rename benchmarks/exps/{ => old}/card241.py (100%) rename benchmarks/exps/{ => old}/card249.py (100%) rename benchmarks/exps/{ => old}/card250.py (100%) rename benchmarks/exps/{ => old}/card251.py (100%) rename benchmarks/exps/{ => old}/card252.py (100%) rename benchmarks/exps/{ => old}/card259.py (100%) rename benchmarks/exps/{ => old}/card262.py (100%) rename benchmarks/exps/{ => old}/card266.py (100%) rename benchmarks/exps/{ => 
old}/card270.py (100%) rename benchmarks/exps/{ => old}/card271.py (100%) rename benchmarks/exps/{ => old}/card272.py (100%) rename benchmarks/exps/{ => old}/card275.py (100%) rename benchmarks/exps/{ => old}/exp10.py (100%) rename benchmarks/exps/{ => old}/exp11.py (100%) rename benchmarks/exps/{ => old}/exp12.py (100%) rename benchmarks/exps/{ => old}/exp13.py (100%) rename benchmarks/exps/{ => old}/exp14.py (100%) rename benchmarks/exps/{ => old}/exp15.py (100%) rename benchmarks/exps/{ => old}/exp18.py (100%) rename benchmarks/exps/{ => old}/exp19.py (100%) rename benchmarks/exps/{ => old}/exp3.py (100%) rename benchmarks/exps/{ => old}/exp6_2.py (100%) rename benchmarks/exps/{ => old}/exp9.py (100%) rename benchmarks/exps/{ => old}/fairnessjct.py (100%) rename benchmarks/exps/{ => old}/gperf.py (100%) rename benchmarks/exps/{ => old}/jct.py (100%) rename benchmarks/exps/{ => old}/mem.py (100%) rename benchmarks/exps/{ => old}/memop.py (100%) rename benchmarks/exps/{ => old}/mnist.py (100%) rename benchmarks/exps/{ => old}/mnistnv.py (100%) rename benchmarks/exps/{ => old}/one.py (100%) rename benchmarks/exps/{ => old}/onetf.py (100%) rename benchmarks/exps/{ => old}/optracing.py (100%) rename benchmarks/exps/{ => old}/paral.py (100%) rename benchmarks/exps/{ => old}/paral2.py (100%) rename benchmarks/exps/{ => old}/paraltf.py (100%) rename benchmarks/exps/{ => old}/run5.py (100%) rename benchmarks/exps/{ => old}/save_train_dir.py (87%) rename benchmarks/exps/{ => old}/serialmany.py (100%) rename benchmarks/exps/{ => old}/two.py (100%) diff --git a/benchmarks/exps/old/__init__.py b/benchmarks/exps/old/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/exps/bigrun.py b/benchmarks/exps/old/bigrun.py similarity index 100% rename from benchmarks/exps/bigrun.py rename to benchmarks/exps/old/bigrun.py diff --git a/benchmarks/exps/card13.py b/benchmarks/exps/old/card13.py similarity index 100% rename from benchmarks/exps/card13.py rename 
to benchmarks/exps/old/card13.py diff --git a/benchmarks/exps/card185.py b/benchmarks/exps/old/card185.py similarity index 100% rename from benchmarks/exps/card185.py rename to benchmarks/exps/old/card185.py diff --git a/benchmarks/exps/card186.py b/benchmarks/exps/old/card186.py similarity index 100% rename from benchmarks/exps/card186.py rename to benchmarks/exps/old/card186.py diff --git a/benchmarks/exps/card188.py b/benchmarks/exps/old/card188.py similarity index 100% rename from benchmarks/exps/card188.py rename to benchmarks/exps/old/card188.py diff --git a/benchmarks/exps/card189.py b/benchmarks/exps/old/card189.py similarity index 100% rename from benchmarks/exps/card189.py rename to benchmarks/exps/old/card189.py diff --git a/benchmarks/exps/card194.py b/benchmarks/exps/old/card194.py similarity index 100% rename from benchmarks/exps/card194.py rename to benchmarks/exps/old/card194.py diff --git a/benchmarks/exps/card195.py b/benchmarks/exps/old/card195.py similarity index 100% rename from benchmarks/exps/card195.py rename to benchmarks/exps/old/card195.py diff --git a/benchmarks/exps/card198.py b/benchmarks/exps/old/card198.py similarity index 100% rename from benchmarks/exps/card198.py rename to benchmarks/exps/old/card198.py diff --git a/benchmarks/exps/card203.py b/benchmarks/exps/old/card203.py similarity index 100% rename from benchmarks/exps/card203.py rename to benchmarks/exps/old/card203.py diff --git a/benchmarks/exps/card206.py b/benchmarks/exps/old/card206.py similarity index 100% rename from benchmarks/exps/card206.py rename to benchmarks/exps/old/card206.py diff --git a/benchmarks/exps/card207.py b/benchmarks/exps/old/card207.py similarity index 100% rename from benchmarks/exps/card207.py rename to benchmarks/exps/old/card207.py diff --git a/benchmarks/exps/card210.py b/benchmarks/exps/old/card210.py similarity index 100% rename from benchmarks/exps/card210.py rename to benchmarks/exps/old/card210.py diff --git a/benchmarks/exps/card211.py 
b/benchmarks/exps/old/card211.py similarity index 100% rename from benchmarks/exps/card211.py rename to benchmarks/exps/old/card211.py diff --git a/benchmarks/exps/card212.py b/benchmarks/exps/old/card212.py similarity index 100% rename from benchmarks/exps/card212.py rename to benchmarks/exps/old/card212.py diff --git a/benchmarks/exps/card214.py b/benchmarks/exps/old/card214.py similarity index 100% rename from benchmarks/exps/card214.py rename to benchmarks/exps/old/card214.py diff --git a/benchmarks/exps/card218.py b/benchmarks/exps/old/card218.py similarity index 100% rename from benchmarks/exps/card218.py rename to benchmarks/exps/old/card218.py diff --git a/benchmarks/exps/card219.py b/benchmarks/exps/old/card219.py similarity index 100% rename from benchmarks/exps/card219.py rename to benchmarks/exps/old/card219.py diff --git a/benchmarks/exps/card233.py b/benchmarks/exps/old/card233.py similarity index 100% rename from benchmarks/exps/card233.py rename to benchmarks/exps/old/card233.py diff --git a/benchmarks/exps/card234.py b/benchmarks/exps/old/card234.py similarity index 100% rename from benchmarks/exps/card234.py rename to benchmarks/exps/old/card234.py diff --git a/benchmarks/exps/card235.py b/benchmarks/exps/old/card235.py similarity index 100% rename from benchmarks/exps/card235.py rename to benchmarks/exps/old/card235.py diff --git a/benchmarks/exps/card236.py b/benchmarks/exps/old/card236.py similarity index 100% rename from benchmarks/exps/card236.py rename to benchmarks/exps/old/card236.py diff --git a/benchmarks/exps/card240.py b/benchmarks/exps/old/card240.py similarity index 100% rename from benchmarks/exps/card240.py rename to benchmarks/exps/old/card240.py diff --git a/benchmarks/exps/card241.py b/benchmarks/exps/old/card241.py similarity index 100% rename from benchmarks/exps/card241.py rename to benchmarks/exps/old/card241.py diff --git a/benchmarks/exps/card249.py b/benchmarks/exps/old/card249.py similarity index 100% rename from 
benchmarks/exps/card249.py rename to benchmarks/exps/old/card249.py diff --git a/benchmarks/exps/card250.py b/benchmarks/exps/old/card250.py similarity index 100% rename from benchmarks/exps/card250.py rename to benchmarks/exps/old/card250.py diff --git a/benchmarks/exps/card251.py b/benchmarks/exps/old/card251.py similarity index 100% rename from benchmarks/exps/card251.py rename to benchmarks/exps/old/card251.py diff --git a/benchmarks/exps/card252.py b/benchmarks/exps/old/card252.py similarity index 100% rename from benchmarks/exps/card252.py rename to benchmarks/exps/old/card252.py diff --git a/benchmarks/exps/card259.py b/benchmarks/exps/old/card259.py similarity index 100% rename from benchmarks/exps/card259.py rename to benchmarks/exps/old/card259.py diff --git a/benchmarks/exps/card262.py b/benchmarks/exps/old/card262.py similarity index 100% rename from benchmarks/exps/card262.py rename to benchmarks/exps/old/card262.py diff --git a/benchmarks/exps/card266.py b/benchmarks/exps/old/card266.py similarity index 100% rename from benchmarks/exps/card266.py rename to benchmarks/exps/old/card266.py diff --git a/benchmarks/exps/card270.py b/benchmarks/exps/old/card270.py similarity index 100% rename from benchmarks/exps/card270.py rename to benchmarks/exps/old/card270.py diff --git a/benchmarks/exps/card271.py b/benchmarks/exps/old/card271.py similarity index 100% rename from benchmarks/exps/card271.py rename to benchmarks/exps/old/card271.py diff --git a/benchmarks/exps/card272.py b/benchmarks/exps/old/card272.py similarity index 100% rename from benchmarks/exps/card272.py rename to benchmarks/exps/old/card272.py diff --git a/benchmarks/exps/card275.py b/benchmarks/exps/old/card275.py similarity index 100% rename from benchmarks/exps/card275.py rename to benchmarks/exps/old/card275.py diff --git a/benchmarks/exps/exp10.py b/benchmarks/exps/old/exp10.py similarity index 100% rename from benchmarks/exps/exp10.py rename to benchmarks/exps/old/exp10.py diff --git 
a/benchmarks/exps/exp11.py b/benchmarks/exps/old/exp11.py similarity index 100% rename from benchmarks/exps/exp11.py rename to benchmarks/exps/old/exp11.py diff --git a/benchmarks/exps/exp12.py b/benchmarks/exps/old/exp12.py similarity index 100% rename from benchmarks/exps/exp12.py rename to benchmarks/exps/old/exp12.py diff --git a/benchmarks/exps/exp13.py b/benchmarks/exps/old/exp13.py similarity index 100% rename from benchmarks/exps/exp13.py rename to benchmarks/exps/old/exp13.py diff --git a/benchmarks/exps/exp14.py b/benchmarks/exps/old/exp14.py similarity index 100% rename from benchmarks/exps/exp14.py rename to benchmarks/exps/old/exp14.py diff --git a/benchmarks/exps/exp15.py b/benchmarks/exps/old/exp15.py similarity index 100% rename from benchmarks/exps/exp15.py rename to benchmarks/exps/old/exp15.py diff --git a/benchmarks/exps/exp18.py b/benchmarks/exps/old/exp18.py similarity index 100% rename from benchmarks/exps/exp18.py rename to benchmarks/exps/old/exp18.py diff --git a/benchmarks/exps/exp19.py b/benchmarks/exps/old/exp19.py similarity index 100% rename from benchmarks/exps/exp19.py rename to benchmarks/exps/old/exp19.py diff --git a/benchmarks/exps/exp3.py b/benchmarks/exps/old/exp3.py similarity index 100% rename from benchmarks/exps/exp3.py rename to benchmarks/exps/old/exp3.py diff --git a/benchmarks/exps/exp6_2.py b/benchmarks/exps/old/exp6_2.py similarity index 100% rename from benchmarks/exps/exp6_2.py rename to benchmarks/exps/old/exp6_2.py diff --git a/benchmarks/exps/exp9.py b/benchmarks/exps/old/exp9.py similarity index 100% rename from benchmarks/exps/exp9.py rename to benchmarks/exps/old/exp9.py diff --git a/benchmarks/exps/fairnessjct.py b/benchmarks/exps/old/fairnessjct.py similarity index 100% rename from benchmarks/exps/fairnessjct.py rename to benchmarks/exps/old/fairnessjct.py diff --git a/benchmarks/exps/gperf.py b/benchmarks/exps/old/gperf.py similarity index 100% rename from benchmarks/exps/gperf.py rename to 
benchmarks/exps/old/gperf.py diff --git a/benchmarks/exps/jct.py b/benchmarks/exps/old/jct.py similarity index 100% rename from benchmarks/exps/jct.py rename to benchmarks/exps/old/jct.py diff --git a/benchmarks/exps/mem.py b/benchmarks/exps/old/mem.py similarity index 100% rename from benchmarks/exps/mem.py rename to benchmarks/exps/old/mem.py diff --git a/benchmarks/exps/memop.py b/benchmarks/exps/old/memop.py similarity index 100% rename from benchmarks/exps/memop.py rename to benchmarks/exps/old/memop.py diff --git a/benchmarks/exps/mnist.py b/benchmarks/exps/old/mnist.py similarity index 100% rename from benchmarks/exps/mnist.py rename to benchmarks/exps/old/mnist.py diff --git a/benchmarks/exps/mnistnv.py b/benchmarks/exps/old/mnistnv.py similarity index 100% rename from benchmarks/exps/mnistnv.py rename to benchmarks/exps/old/mnistnv.py diff --git a/benchmarks/exps/one.py b/benchmarks/exps/old/one.py similarity index 100% rename from benchmarks/exps/one.py rename to benchmarks/exps/old/one.py diff --git a/benchmarks/exps/onetf.py b/benchmarks/exps/old/onetf.py similarity index 100% rename from benchmarks/exps/onetf.py rename to benchmarks/exps/old/onetf.py diff --git a/benchmarks/exps/optracing.py b/benchmarks/exps/old/optracing.py similarity index 100% rename from benchmarks/exps/optracing.py rename to benchmarks/exps/old/optracing.py diff --git a/benchmarks/exps/paral.py b/benchmarks/exps/old/paral.py similarity index 100% rename from benchmarks/exps/paral.py rename to benchmarks/exps/old/paral.py diff --git a/benchmarks/exps/paral2.py b/benchmarks/exps/old/paral2.py similarity index 100% rename from benchmarks/exps/paral2.py rename to benchmarks/exps/old/paral2.py diff --git a/benchmarks/exps/paraltf.py b/benchmarks/exps/old/paraltf.py similarity index 100% rename from benchmarks/exps/paraltf.py rename to benchmarks/exps/old/paraltf.py diff --git a/benchmarks/exps/run5.py b/benchmarks/exps/old/run5.py similarity index 100% rename from 
benchmarks/exps/run5.py rename to benchmarks/exps/old/run5.py diff --git a/benchmarks/exps/save_train_dir.py b/benchmarks/exps/old/save_train_dir.py similarity index 87% rename from benchmarks/exps/save_train_dir.py rename to benchmarks/exps/old/save_train_dir.py index c3ed2fd..96398af 100644 --- a/benchmarks/exps/save_train_dir.py +++ b/benchmarks/exps/old/save_train_dir.py @@ -77,20 +77,18 @@ def do_mem(logdir, network, batch_size): logger.info(f'Saving model checkpoint for {network}_{batch_size} for {batch_num} iter') - final_dst = logdir / 'tf' / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None)) + final_dst = logdir / WTL.from_name(network).canonical_name(RunConfig(batch_size, batch_num, None)) with atomic_directory(final_dst) as outputdir: logger.info(' Running on TF') wl = WTL.create(network, batch_size, batch_num, Executor.TF) wl.env['SALUS_SAVE_MODEL'] = '1' + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = str(model_dir) + run_tf(outputdir, wl) - # filter and move file to a more convinent name - for f in pathlib.Path(outputdir).iterdir(): - with f.with_name('alloc.output').open('w') as file: - grep = execute(['egrep', r"] (\+|-)", f.name], stdout=file, cwd=str(f.parent)) - grep.wait() - f.unlink() - break return final_dst diff --git a/benchmarks/exps/serialmany.py b/benchmarks/exps/old/serialmany.py similarity index 100% rename from benchmarks/exps/serialmany.py rename to benchmarks/exps/old/serialmany.py diff --git a/benchmarks/exps/two.py b/benchmarks/exps/old/two.py similarity index 100% rename from benchmarks/exps/two.py rename to benchmarks/exps/old/two.py From 17bb16cd39752002e5f06e8d4bb588b3d7e7017b Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Fri, 5 Apr 2019 20:33:39 -0400 Subject: [PATCH 07/13] Don't print warning is the value is the same --- 
src/oplibraries/tensorflow/v3/smblocker.cpp | 2 +- src/oplibraries/tensorflow/v3/smblocker.h | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/oplibraries/tensorflow/v3/smblocker.cpp b/src/oplibraries/tensorflow/v3/smblocker.cpp index 1978099..b99e36f 100644 --- a/src/oplibraries/tensorflow/v3/smblocker.cpp +++ b/src/oplibraries/tensorflow/v3/smblocker.cpp @@ -119,7 +119,7 @@ void SMBlocker::saveCurrentThreadResults(uint64_t graphId, int nodeId) } auto &usage = m_cache[std::make_pair(graphId, nodeId)]; - if (usage.blockCount != 0 || usage.threadPerBlock != 0) { + if ((usage.blockCount != 0 || usage.threadPerBlock != 0) && usage != newUsage) { LOG(WARNING) << "Overriding SM usage for graph " << graphId << " node " << nodeId << ", previous: blk=" << usage.blockCount << " thd=" << usage.threadPerBlock << ", new: blk=" << newUsage.blockCount << " thd=" << newUsage.threadPerBlock; diff --git a/src/oplibraries/tensorflow/v3/smblocker.h b/src/oplibraries/tensorflow/v3/smblocker.h index 003afa4..dca7bfc 100644 --- a/src/oplibraries/tensorflow/v3/smblocker.h +++ b/src/oplibraries/tensorflow/v3/smblocker.h @@ -42,6 +42,14 @@ struct SMUsage { uint64_t threadPerBlock = 0; uint64_t blockCount = 0; + + bool operator ==(const SMUsage &other) { + return threadPerBlock == other.threadPerBlock && blockCount == other.blockCount; + } + + bool operator !=(const SMUsage &other) { + return !(*this == other); + } }; class SMBlocker From 78562d916de42f9a4accca0b625e7afaa8df528b Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Mon, 8 Apr 2019 22:58:23 -0400 Subject: [PATCH 08/13] Benchmark driver for TFWeb workloads --- benchmarks/driver/runner.py | 97 +++++++++++++++++++++++++++++++++++ benchmarks/driver/tfserver.py | 4 +- benchmarks/driver/workload.py | 26 +++++++++- 3 files changed, 124 insertions(+), 3 deletions(-) diff --git a/benchmarks/driver/runner.py b/benchmarks/driver/runner.py index df09359..eef3249 100644 --- a/benchmarks/driver/runner.py +++ 
b/benchmarks/driver/runner.py @@ -40,6 +40,11 @@ flags.DEFINE_string('tfbench_base', '../tf_benchmarks', 'Base dir of TFBenchmark based workloads') flags.DEFINE_string('unit_base', 'tests', 'Base dir of unittest based workloads') flags.DEFINE_string('fathom_base', '../fathom', 'Base dir of Fathom based workloads') +flags.DEFINE_string('tfweb_base', '../tfweb', 'Base dir of TFWeb based workloads') +flags.DEFINE_string('tfweb_saved_model_dir', '~/../symbiotic/peifeng/tf_cnn_benchmarks_models/saved_models', + 'SavedModel dir of TFWeb based workloads') +flags.DEFINE_string('tfweb_request_body_dir', '~/../symbiotic/peifeng/tf_cnn_benchmarks_models/reqeusts', + 'Predefined request body dir for TFWeb based workloads') flags.DEFINE_boolean('no_capture', False, 'Do not capture workload outputs') @@ -280,3 +285,95 @@ def __call__(self, executor, output_file): output_file.parent.mkdir(exist_ok=True, parents=True) with output_file.open('w') as f: return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT) + + +class TFWebRunner(Runner): + """ + Run a TFWeb based inference job + + We start several servers and a balancer on the same node. + The server commandline: tfweb --model=path/to/saved_model/network --sess_target=... 
+ The client commandline: gobetween from-file xxx.toml + """ + + def __init__(self, wl, base_dir=None): + super().__init__(wl) + self.base_dir = base_dir + if self.base_dir is None: + self.base_dir = FLAGS.tfweb_base + + def __call__(self, executor, output_file): + # type: (Executor, Path) -> Popen + model_name = self.wl.name.rstrip('web') + cwd = self.base_dir + cmd = [ + 'stdbuf', '-o0', '-e0', '--', + 'examples/cluster/start_cluster', + '--model="{}"'.format(str(Path(FLAGS.tfweb_saved_model_dir).joinpath(model_name))), + ] + + if executor == Executor.Salus: + cmd += [ + '--sess_target', SalusServer.current_server().endpoint, + ] + elif executor == Executor.TF: + cmd += [ + '--sess_target', '""', + ] + elif executor == Executor.TFDist: + cmd += [ + '--sess_target', TFDistServer.current_server().endpoint, + ] + else: + raise ValueError(f'Unknown executor: {executor}') + + num_replicas = self.wl.env.pop('SALUS_TFWEB_REPLICAS', '1') + cmd += [ + '--num_replicas', num_replicas + ] + + if FLAGS.no_capture: + return execute(cmd, cwd=str(cwd), env=self.env) + else: + output_file.parent.mkdir(exist_ok=True, parents=True) + with output_file.open('w') as f: + return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT) + + +class TFWebClientRunner(Runner): + """ + Run a tfweb client attacker. 
+ Command: examples/cluster/tfweb-client TARGET REQ_BODY PLANTXT + """ + + def __init__(self, wl, base_dir=None): + super().__init__(wl) + self.base_dir = base_dir + if self.base_dir is None: + self.base_dir = FLAGS.tfweb_base + + def __call__(self, executor, output_file): + # type: (Executor, Path) -> Popen + + model_name = self.wl.name.rstrip('client') + + cwd = self.base_dir + cmd = [ + 'stdbuf', '-o0', '-e0', '--', + 'examples/tfweb-client', + '-output', str(output_file), + self.wl.target, + # request body + str(Path(FLAGS.tfweb_request_body_dir).joinpath(model_name).with_suffix('.txt')), + # always write plan to stdin + '-', + ] + + proc = execute(cmd, cwd=str(cwd), env=self.env, stdin=sp.PIPE) + proc.stdin.write(self._plan_to_bytes()) + proc.stdin.close() + return proc + + def _plan_to_bytes(self): + return ' '.join(self.wl.plan).encode('utf-8') + diff --git a/benchmarks/driver/tfserver.py b/benchmarks/driver/tfserver.py index 8c58f08..7e505b3 100644 --- a/benchmarks/driver/tfserver.py +++ b/benchmarks/driver/tfserver.py @@ -29,7 +29,7 @@ from datetime import datetime from absl import flags from contextlib import contextmanager -from typing import List, Deque, Dict, Union +from typing import List, Deque, Dict, Union, Optional from benchmarks.driver.utils import prompt, remove_prefix from benchmarks.driver.utils.prompt import pause @@ -65,7 +65,7 @@ def __init__(self, env=None, outputdir=None): self._build_cmd() - self.proc = None # type: Popen + self.proc = None # type: Optional[Popen] def _build_cmd(self): # type: () -> List[str] diff --git a/benchmarks/driver/workload.py b/benchmarks/driver/workload.py index 0250bc3..6c8002d 100644 --- a/benchmarks/driver/workload.py +++ b/benchmarks/driver/workload.py @@ -28,7 +28,7 @@ from typing import Dict, Iterable, Type, Union from .runner import Runner, RunConfig, Popen, Executor -from .runner import TFBenchmarkRunner, UnittestRunner, FathomRunner +from .runner import TFBenchmarkRunner, UnittestRunner, FathomRunner, 
TFWebRunner, TFWebClientRunner from .utils import try_with_default, kill_tree, unique from .utils.compatiblity import pathlib @@ -450,6 +450,30 @@ def run(self, output_file): WorkloadTemplate.define('superreseval', [1, 5, 10], UnittestRunner) WorkloadTemplate.define('seq2seqeval', ['small', 'medium', 'large'], UnittestRunner) +WorkloadTemplate.define('vgg11web', [1], TFWebRunner) +WorkloadTemplate.define('vgg16web', [1], TFWebRunner) +WorkloadTemplate.define('vgg19web', [1], TFWebRunner) +WorkloadTemplate.define('resnet50web', [1], TFWebRunner) +WorkloadTemplate.define('resnet101web', [1], TFWebRunner) +WorkloadTemplate.define('resnet152web', [1], TFWebRunner) +WorkloadTemplate.define('googlenetweb', [1], TFWebRunner) +WorkloadTemplate.define('alexnetweb', [1], TFWebRunner) +WorkloadTemplate.define('overfeatweb', [1], TFWebRunner) +WorkloadTemplate.define('inception3web', [1], TFWebRunner) +WorkloadTemplate.define('inception4web', [1], TFWebRunner) + +WorkloadTemplate.define('vgg11client', [1], TFWebClientRunner) +WorkloadTemplate.define('vgg16client', [1], TFWebClientRunner) +WorkloadTemplate.define('vgg19client', [1], TFWebClientRunner) +WorkloadTemplate.define('resnet50client', [1], TFWebClientRunner) +WorkloadTemplate.define('resnet101client', [1], TFWebClientRunner) +WorkloadTemplate.define('resnet152client', [1], TFWebClientRunner) +WorkloadTemplate.define('googlenetclient', [1], TFWebClientRunner) +WorkloadTemplate.define('alexnetclient', [1], TFWebClientRunner) +WorkloadTemplate.define('overfeatclient', [1], TFWebClientRunner) +WorkloadTemplate.define('inception3client', [1], TFWebClientRunner) +WorkloadTemplate.define('inception4client', [1], TFWebClientRunner) + # noinspection PyUnusedLocal def _disable_init(self): From 698f3e154ddf35cf8b4592cf1c721797c219f4cc Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Fri, 19 Apr 2019 15:30:10 -0400 Subject: [PATCH 09/13] Inference workload --- benchmarks/driver/runner.py | 85 ++++++++-- 
benchmarks/driver/server/__init__.py | 2 + benchmarks/driver/utils/utils.py | 9 ++ benchmarks/driver/workload.py | 3 - benchmarks/exps/__init__.py | 18 +++ benchmarks/exps/bs_lat_tput.py | 80 ++++++++++ benchmarks/exps/card304.py | 110 +++++++++++++ benchmarks/exps/card309.py | 147 ++++++++++++++++++ src/oplibraries/tensorflow/tfinstance.cpp | 10 +- src/oplibraries/tensorflow/tfutils.h | 8 + src/oplibraries/tensorflow/v3/smblocker.cpp | 33 ++-- src/oplibraries/tensorflow/v3/smblocker.h | 22 ++- src/oplibraries/tensorflow/v3/tf_executor.cpp | 18 ++- src/rpcserver/zmqserver.cpp | 3 + src/utils/threadutils.h | 74 +++++++++ tests/test_tf/lib/tfhelper.py | 8 + tests/test_tf/test_super_res.py | 4 +- tests/test_tf/test_vae.py | 9 +- 18 files changed, 609 insertions(+), 34 deletions(-) create mode 100644 benchmarks/exps/bs_lat_tput.py create mode 100644 benchmarks/exps/card304.py create mode 100644 benchmarks/exps/card309.py diff --git a/benchmarks/driver/runner.py b/benchmarks/driver/runner.py index eef3249..42482d1 100644 --- a/benchmarks/driver/runner.py +++ b/benchmarks/driver/runner.py @@ -30,7 +30,7 @@ from .server import SalusServer from .tfserver import TFDistServer -from .utils import Popen, execute, snake_to_pascal, str2bool +from .utils import Popen, execute, snake_to_pascal, str2bool, remove_suffix from .utils.compatiblity import pathlib, subprocess as sp Path = pathlib.Path @@ -123,14 +123,26 @@ def __call__(self, executor, output_file): eval_block = self.wl.env.pop('SALUS_TFBENCH_EVAL_BLOCK', 'true') eval_model_dir = self.wl.env.pop('SALUS_TFBENCH_EVAL_MODEL_DIR', 'models') - eval_model_dir = str(Path(eval_model_dir).joinpath(self.wl.name.rstrip('eval'))) + eval_model_dir = str(Path(eval_model_dir).joinpath(remove_suffix(self.wl.name, 'eval'))) eval_saved_model_dir = self.wl.env.pop('SALUS_TFBENCH_EVAL_SAVED_MODEL_DIR', None) if eval_saved_model_dir is not None: - eval_saved_model_dir = str(Path(eval_saved_model_dir).joinpath(self.wl.name.rstrip('eval'))) + 
eval_saved_model_dir = str(Path(eval_saved_model_dir).joinpath(remove_suffix(self.wl.name, 'eval'))) + + num_seconds = self.wl.env.pop('SALUS_ITER_SECONDS', None) + if num_seconds is not None: + cmd += [ + '--num_seconds={}'.format(num_seconds) + ] + + wait_for_signal = self.wl.env.pop('SALUS_WAIT_FOR_SIGNAL', None) + if wait_for_signal is not None: + cmd += [ + '--wait_for_signal={}'.format(wait_for_signal) + ] if self.wl.name.endswith('eval'): - model_name = self.wl.name.rsplit('eval')[0] + model_name = remove_suffix(self.wl.name, 'eval') cmd += [ '--model_dir=' + eval_model_dir, '--model={}'.format(model_name), @@ -174,6 +186,7 @@ def __call__(self, executor, output_file): # type: (Executor, Path) -> Popen env = self.env.copy() env['EXEC_ITER_NUMBER'] = str(self.wl.batch_num) + env['SALUS_BATCH_SIZE'] = str(self.wl.batch_size) if executor == Executor.TFDist: env['SALUS_TFDIST_ENDPOINT'] = TFDistServer.current_server().endpoint @@ -214,6 +227,12 @@ def _construct_test_name(self, executor): }) } + variable_batch_size_models = {'vae', 'superres'} + if remove_suffix(self.wl.name, 'eval') not in variable_batch_size_models: + if self.wl.batch_size not in self.wl.wtl.available_batch_sizes(): + raise ValueError(f"Batch size `{self.wl.batch_size}' is not supported for {self.wl.name}," + f" available ones: {self.wl.wtl.available_batch_sizes()}") + if executor == Executor.Salus: prefix = 'test_rpc_' elif executor == Executor.TF: @@ -226,7 +245,7 @@ def _construct_test_name(self, executor): if self.wl.name.endswith('eval'): prefix += 'eval_' - model_name = self.wl.name.rsplit('eval')[0] + model_name = remove_suffix(self.wl.name, 'eval') if model_name in supported_model: pkg, cls, names = supported_model[model_name] @@ -234,11 +253,16 @@ def _construct_test_name(self, executor): # fallback to guessing pkg = f'test_tf.test_{model_name}' cls = f'Test{snake_to_pascal(model_name)}' + + # get method name names = { s: str(idx) for idx, s in 
enumerate(self.wl.wtl.available_batch_sizes()) } - method = f'{cls}.{prefix}{names[self.wl.batch_size]}' + + postfix = names.get(self.wl.batch_size, '0') + + method = f'{cls}.{prefix}{postfix}' return pkg, method @@ -257,7 +281,7 @@ def __call__(self, executor, output_file): cmd = [ 'stdbuf', '-o0', '-e0', '--', 'python', '-m', 'fathom.cli', - '--workload', self.wl.name.rsplit('eval')[0], + '--workload', remove_suffix(self.wl.name, 'eval'), '--action', 'test' if self.wl.name.endswith('eval') else 'train', '--num_iters', str(self.wl.batch_num), '--batch_size', str(self.wl.batch_size), @@ -287,6 +311,49 @@ def __call__(self, executor, output_file): return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT) +class TFWebDirectRunner(Runner): + """Using TFWeb's load infrastructure to directly run""" + + def __init__(self, wl, base_dir=None): + super().__init__(wl) + self.base_dir = base_dir + if self.base_dir is None: + self.base_dir = FLAGS.tfweb_base + + def __call__(self, executor, output_file): + model_name = remove_suffix(self.wl.name, 'eval') + cwd = self.base_dir + cmd = [ + 'stdbuf', '-o0', '-e0', '--', + 'examples/direct/client', + '--model="{}"'.format(str(Path(FLAGS.tfweb_saved_model_dir).joinpath(model_name))), + '--batch_size={}'.format(self.wl.batch_size), + '--batch_num={}'.format(self.wl.batch_num), + ] + + if executor == Executor.Salus: + cmd += [ + '--sess_target', SalusServer.current_server().endpoint, + ] + elif executor == Executor.TF: + cmd += [ + '--sess_target', '""', + ] + elif executor == Executor.TFDist: + cmd += [ + '--sess_target', TFDistServer.current_server().endpoint, + ] + else: + raise ValueError(f'Unknown executor: {executor}') + + if FLAGS.no_capture: + return execute(cmd, cwd=str(cwd), env=self.env) + else: + output_file.parent.mkdir(exist_ok=True, parents=True) + with output_file.open('w') as f: + return execute(cmd, cwd=str(cwd), env=self.env, stdout=f, stderr=sp.STDOUT) + + class TFWebRunner(Runner): """ Run a 
TFWeb based inference job @@ -304,7 +371,7 @@ def __init__(self, wl, base_dir=None): def __call__(self, executor, output_file): # type: (Executor, Path) -> Popen - model_name = self.wl.name.rstrip('web') + model_name = remove_suffix(self.wl.name, 'web') cwd = self.base_dir cmd = [ 'stdbuf', '-o0', '-e0', '--', @@ -355,7 +422,7 @@ def __init__(self, wl, base_dir=None): def __call__(self, executor, output_file): # type: (Executor, Path) -> Popen - model_name = self.wl.name.rstrip('client') + model_name = remove_suffix(self.wl.name, 'client') cwd = self.base_dir cmd = [ diff --git a/benchmarks/driver/server/__init__.py b/benchmarks/driver/server/__init__.py index 4e6bd8d..2617f42 100644 --- a/benchmarks/driver/server/__init__.py +++ b/benchmarks/driver/server/__init__.py @@ -77,9 +77,11 @@ def _find_executable(self): """Find the absolute path to server executable, according to 'config.build_type'""" candidates = [ self.config.build_dir / self.config.build_type / 'src' / 'executor', + self.config.build_dir / self.config.build_type / 'src' / 'salus-server', self.config.build_dir / self.config.build_type / 'bin' / 'executor', self.config.build_dir / self.config.build_type / 'bin' / 'salus-server', self.config.build_dir / self.config.build_type.lower() / 'src' / 'executor', + self.config.build_dir / self.config.build_type.lower() / 'src' / 'salus-server', self.config.build_dir / self.config.build_type.lower() / 'bin' / 'executor', self.config.build_dir / self.config.build_type.lower() / 'bin' / 'salus-server', ] diff --git a/benchmarks/driver/utils/utils.py b/benchmarks/driver/utils/utils.py index dd50486..98165cb 100644 --- a/benchmarks/driver/utils/utils.py +++ b/benchmarks/driver/utils/utils.py @@ -86,6 +86,14 @@ def remove_prefix(text, prefix): return text # or whatever +def remove_suffix(text, suffix): + # type: (str, str) -> str + """Remove suffix from text if any""" + if text.endswith(suffix): + return text[:len(text)-len(suffix)] + return text + + def 
try_with_default(func, default=None, ignore=Exception): """ A wrapper that ignores exception from a function. """ @@ -301,6 +309,7 @@ def str2bool(v): 'eprint', 'remove_none', 'remove_prefix', + 'remove_suffix', 'try_with_default', 'execute', 'kill_tree', diff --git a/benchmarks/driver/workload.py b/benchmarks/driver/workload.py index 6c8002d..93f9489 100644 --- a/benchmarks/driver/workload.py +++ b/benchmarks/driver/workload.py @@ -151,9 +151,6 @@ def _create(self, batch_size, batch_num, executor=Executor.Salus): def _create_from_rcfg(self, rcfg, executor=Executor.Salus): # type: (RunConfig, Executor) -> Workload - if rcfg.batch_size not in self.available_batch_sizes(): - raise ValueError(f"Batch size `{rcfg.batch_size}' is not supported for {self.name}," - f" available ones: {self.available_batch_sizes()}") return Workload(self, rcfg, executor, self.geometry(rcfg, executor).copy()) @classmethod diff --git a/benchmarks/exps/__init__.py b/benchmarks/exps/__init__.py index 2ba3001..41187ab 100644 --- a/benchmarks/exps/__init__.py +++ b/benchmarks/exps/__init__.py @@ -356,3 +356,21 @@ def expandbs(name): wls = itertools.product(wls, batch_num, executor) return [WTL.create(name, bs, bn, ex) for (name, bs), bn, ex in wls] + + +def wait_on_pipe(pipe): + logger.info(f'Waiting workload to be ready on {pipe}') + with open(pipe, 'rb') as f: + f.read(1) + + +def release_on_pipe(pipe): + logger.info(f'Signaling workload to continue on {pipe}') + with open(pipe, 'wb') as f: + f.write(b"a") + logger.info(f'Workload continued on {pipe}') + + +def sync_on_pipe(pipe): + wait_on_pipe(pipe) + release_on_pipe(pipe) diff --git a/benchmarks/exps/bs_lat_tput.py b/benchmarks/exps/bs_lat_tput.py new file mode 100644 index 0000000..e472b0e --- /dev/null +++ b/benchmarks/exps/bs_lat_tput.py @@ -0,0 +1,80 @@ +# -*- coding: future_fstrings -*- +# +# Copyright 2019 Peifeng Yu +# +# This file is part of Salus +# (see https://github.com/SymbioticLab/Salus). 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Measure the throughput and latency of each batch size. + +Collected data: model, batch_size, latency, throughput (in 2 min) +""" +from __future__ import absolute_import, print_function, division, unicode_literals + +from absl import flags +from typing import Sequence +import logging + +from benchmarks.driver.runner import TFBenchmarkRunner +from benchmarks.driver.server.config import presets +from benchmarks.driver.utils.compatiblity import pathlib +from benchmarks.driver.workload import Executor, WTL +from benchmarks.exps import run_tf, select_workloads, run_seq, maybe_forced_preset + + +FLAGS = flags.FLAGS +logger = logging.getLogger(__name__) + + +def set_env(wl): + wl.env['SALUS_TFBENCH_EVAL_INTERVAL'] = '0' + wl.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '0' + wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true' + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = model_dir + + +def do_measure(scfg, name, batch_sizes): + batch_num = 100 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for bs in batch_sizes: + wl = WTL.create(name, bs, batch_num, executor=Executor.Salus) + set_env(wl) + run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus"), wl) + + wl = WTL.create(name, bs, batch_num, executor=Executor.TF) + set_env(wl) + 
run_seq(scfg.copy(output_dir=FLAGS.save_dir / "tf"), wl) + + +def main(argv): + # type: (Sequence[str]) -> None + scfg = maybe_forced_preset(presets.MostEfficient) + scfg.logconf = 'disable' + + name, batch_sizes = "alexnet", [] # keep batch_sizes bound so the default below is reachable + if len(argv) > 1: + name = argv[0] + batch_sizes = [int(v) for v in argv[1:]] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + do_measure(scfg, name, batch_sizes) + diff --git a/benchmarks/exps/card304.py b/benchmarks/exps/card304.py new file mode 100644 index 0000000..bd193ef --- /dev/null +++ b/benchmarks/exps/card304.py @@ -0,0 +1,110 @@ +# -*- coding: future_fstrings -*- +# +# Copyright 2019 Peifeng Yu +# +# This file is part of Salus +# (see https://github.com/SymbioticLab/Salus). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Card 304: Experiment one inference job with one training job + +Record inference latency. Compare inference job latency running alone vs. running with a training job. + +The latency should be measured with increasing throughput (qps) for the inference job. 
+ +Collected data: inference per iteration speed (latency), training throughput (derived from per iteration speed) +""" +from __future__ import absolute_import, print_function, division, unicode_literals + +import tempfile +from typing import Sequence + +from absl import flags +import logging +import os + +from benchmarks.driver.server.config import presets +from benchmarks.driver.workload import WTL, Executor +from benchmarks.driver.utils.compatiblity import pathlib +from benchmarks.exps import run_seq, maybe_forced_preset, RunFn, sync_on_pipe, Pause, wait_on_pipe, release_on_pipe + + +FLAGS = flags.FLAGS +logger = logging.getLogger(__name__) + + +def set_env(wl): + wl.env['SALUS_TFBENCH_EVAL_INTERVAL'] = '0' + wl.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '0' + wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true' + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = model_dir + + +def main(argv): + # type: (Sequence[str]) -> None + scfg = maybe_forced_preset(presets.MostEfficient) + scfg.logconf = 'disable' + + name, batch_sizes = "alexneteval", [] # keep batch_sizes bound so the default below is reachable + if len(argv) > 1: + name = argv[0] + batch_sizes = [int(v) for v in argv[1:]] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + batch_num = 300 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for idx, bs in enumerate(batch_sizes): + with tempfile.TemporaryDirectory() as td: + # create a background training job, the batch number has no effect here, + # only used to distinguish different runs + trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.Salus) + # make sure it runs long enough + trainWl.env['SALUS_ITER_SECONDS'] = '300' + + # create a pipe to signal trainWl + pipetrain = str(pathlib.Path(td).joinpath('fifotrain')) + os.mkfifo(pipetrain) + trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain + + # create the foreground inference job + wl = 
WTL.create(name, bs, batch_num, executor=Executor.Salus) + set_env(wl) + wl.env['SALUS_ITER_SECONDS'] = '150' + + pipe = str(pathlib.Path(td).joinpath('fifo')) + os.mkfifo(pipe) + wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe + + run_seq(scfg.copy(output_dir=FLAGS.save_dir / (name + "-inception4")), + trainWl, # start the background job + wl, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + diff --git a/benchmarks/exps/card309.py b/benchmarks/exps/card309.py new file mode 100644 index 0000000..bfa3347 --- /dev/null +++ b/benchmarks/exps/card309.py @@ -0,0 +1,147 @@ +# -*- coding: future_fstrings -*- +# +# Copyright 2019 Peifeng Yu +# +# This file is part of Salus +# (see https://github.com/SymbioticLab/Salus). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Card 309: Experiment multiple inference jobs together + +Record inference latency. Compare inference job latency running alone vs. running with others. + +The latency should be measured with increasing throughput (qps) for the inference job. 
+ +Collected data: + - inference per iteration speed (latency) + - training throughput (derived from per iteration speed and batch size) +""" +from __future__ import absolute_import, print_function, division, unicode_literals + +import os +import tempfile +from typing import Sequence + +from absl import flags +import logging + +from benchmarks.driver.server.config import presets +from benchmarks.driver.workload import WTL, Executor +from benchmarks.driver.utils.compatiblity import pathlib +from benchmarks.exps import ( + run_seq, maybe_forced_preset, case_switch_main, RunFn, sync_on_pipe, wait_on_pipe, + release_on_pipe, +) + + +FLAGS = flags.FLAGS +logger = logging.getLogger(__name__) + +flags.DEFINE_integer('num_replicas', 1, 'Number of replicas to run concurrently') + + +def set_env(wl): + wl.env['SALUS_TFBENCH_EVAL_INTERVAL'] = '0' + wl.env['SALUS_TFBENCH_EVAL_RAND_FACTOR'] = '0' + wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true' + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = model_dir + + # run for 120 seconds, ignoring BATCH_NUM + wl.env['SALUS_ITER_SECONDS'] = '120' + + +def do_inferences(scfg, names, batch_sizes): + batch_num = 100 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for bs in batch_sizes: + wls = [] + for name in names: + if not name.endswith('eval'): + raise ValueError('Not an inference workload!!!') + wl = WTL.create(name, bs, batch_num, executor=Executor.Salus) + set_env(wl) + wls.append(wl) + + run_seq(scfg, *wls) + + +def same(argv): + # type: (Sequence[str]) -> None + scfg = maybe_forced_preset(presets.MostEfficient) + scfg.logconf = 'disable' + + name, batch_sizes = "alexneteval", [] # keep batch_sizes bound so the default below is reachable + if len(argv) > 1: + name = argv[0] + batch_sizes = [int(v) for v in argv[1:]] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + do_inferences(scfg.copy(output_dir=FLAGS.save_dir / 
str(FLAGS.num_replicas)), + [name] * FLAGS.num_replicas, + batch_sizes) + + +def diff(argv): + # type: (Sequence[str]) -> None + scfg = maybe_forced_preset(presets.MostEfficient) + scfg.logconf = 'disable' + + # all non-integer argv are treated as names + names = [] + batch_sizes = [] + for arg in argv: + try: + batch_sizes.append(int(arg)) + except ValueError: + names.append(arg) + + # create jobs + batch_num = 100 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for bs in batch_sizes: + with tempfile.TemporaryDirectory() as td: + wls = [] + pipes = [] + for name in names: + if not name.endswith('eval'): + raise ValueError('Not an inference workload!!!') + wl = WTL.create(name, bs, batch_num, executor=Executor.Salus) + set_env(wl) + wls.append(wl) + + # also add a small pause to make sure every job starts + pipe = str(pathlib.Path(td).joinpath(wl.canonical_name).with_suffix('.pipe')) + os.mkfifo(pipe) + wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe; pipes.append(pipe) # hand the FIFO to the job and record its path (not the list itself) + + # wait all jobs to be ready + wls.append(RunFn(lambda workloads, **kwargs: [wait_on_pipe(pipe) for pipe in pipes] and None)) + # signal all jobs to start + wls.append(RunFn(lambda workloads, **kwargs: [release_on_pipe(pipe) for pipe in pipes] and None)) + + run_seq(scfg.copy(output_dir=FLAGS.save_dir / '-'.join(names)), + *wls) + + +@case_switch_main +def main(): + return same, diff diff --git a/src/oplibraries/tensorflow/tfinstance.cpp b/src/oplibraries/tensorflow/tfinstance.cpp index 1b667eb..7e78d63 100644 --- a/src/oplibraries/tensorflow/tfinstance.cpp +++ b/src/oplibraries/tensorflow/tfinstance.cpp @@ -140,7 +140,13 @@ void TFInstance::handleCreateSession(std::unique_ptr & static_cast(std::round(sstl::getOrDefault(m.persistant(), "TIME:TOTAL", 0.0))) * 1000; ectx->setExpectedRunningTime(totalRunningTime); - m_laneMgr->requestLanes(std::move(layout), [&resp, cb = std::move(cb), req = std::move(req), ectx = std::move(ectx), + // smaller is higher priority + auto priority = 
static_cast(sstl::getOrDefault(m.persistant(), "SCHED:PRIORITY", 20)); + + LOG(INFO) << "Accept session with priority " << priority; + + m_laneMgr->requestLanes(std::move(layout), [&resp, priority, + cb = std::move(cb), req = std::move(req), ectx = std::move(ectx), this](auto &&lanes) mutable { std::vector devices; @@ -174,7 +180,7 @@ void TFInstance::handleCreateSession(std::unique_ptr & }); // Keep a reference for lanes on ectx's user data // which should outlive the TFSession. - ectx->setUserData(std::forward(lanes)); + ectx->setUserData(TFExecutionCtxData{std::forward(lanes), priority}); // Register force interrupt handler ectx->setInterruptCallback([this, handle]() { popSession(handle)->safeClose(); }); diff --git a/src/oplibraries/tensorflow/tfutils.h b/src/oplibraries/tensorflow/tfutils.h index b36f103..0b6c5ef 100644 --- a/src/oplibraries/tensorflow/tfutils.h +++ b/src/oplibraries/tensorflow/tfutils.h @@ -25,6 +25,7 @@ #include #include #include +#include #define CallWithMasterMethodName(m) \ m(CreateSession) m(ExtendSession) m(PartialRunSetup) m(CloseSession) m(ListDevices) m(Reset) m(RunStep) @@ -72,6 +73,13 @@ using POpKernel = std::unique_ptr; std::string tfGraphToGraphviz(const tf::Graph &g, const std::string &name); +class LaneHolder; +struct TFExecutionCtxData +{ + std::vector> lanes; + int priority; +}; + } // namespace salus::oplib::tensorflow #endif // SALUS_OPLIB_TENSORFLOW_TFUTILS_H diff --git a/src/oplibraries/tensorflow/v3/smblocker.cpp b/src/oplibraries/tensorflow/v3/smblocker.cpp index b99e36f..4220473 100644 --- a/src/oplibraries/tensorflow/v3/smblocker.cpp +++ b/src/oplibraries/tensorflow/v3/smblocker.cpp @@ -69,9 +69,11 @@ void salus_kernel_launch_callback(unsigned int gridDimX, unsigned int gridDimY, {blockDimX, blockDimY, blockDimZ}, sharedMemBytes, }); - LOG(DEBUG) << "Got kernel launch params: blk=(" - << gridDimX << "," << gridDimY << "," << gridDimZ - << ") x thd=(" << blockDimX << "," << blockDimY << "," << blockDimZ << "), " << 
sharedMemBytes; + if (false) { + LOG(DEBUG) << "Got kernel launch params: blk=(" + << gridDimX << "," << gridDimY << "," << gridDimZ + << ") x thd=(" << blockDimX << "," << blockDimY << "," << blockDimZ << "), " << sharedMemBytes; + } } } // extern "C" @@ -103,9 +105,11 @@ SMBlocker::SMBlocker() void SMBlocker::saveCurrentThreadResults(uint64_t graphId, int nodeId) { + LOG(DEBUG) << "Release at SMBlocker: graph " << graphId << " node " << nodeId + << " sm " << CurrentThreadHoldingBlocks; // release blocks first { - m_freeBlocks.notify(CurrentThreadHoldingBlocks); + m_freeBlocks.post(CurrentThreadHoldingBlocks); CurrentThreadHoldingBlocks = 0; } @@ -129,23 +133,32 @@ void SMBlocker::saveCurrentThreadResults(uint64_t graphId, int nodeId) SavedCudaKernelLaunches.clear(); } -bool SMBlocker::maybeBlock(uint64_t graphId, int nodeId) +bool SMBlocker::tryTake(uint64_t graphId, int nodeId, int priority) { auto smUsage = getUsageForKernel(graphId, nodeId); - return m_freeBlocks.may_block(smUsage); + auto res = m_freeBlocks.try_wait(smUsage, priority); + if (res) { + // save the count + CurrentThreadHoldingBlocks = smUsage; + LOG(DEBUG) << "Passed at SMBlocker: graph " << graphId << " node " << nodeId + << " sm " << smUsage << " priority " << priority; + } + return res; } -void SMBlocker::wait(uint64_t graphId, int nodeId) +void SMBlocker::wait(uint64_t graphId, int nodeId, int priority) { auto smUsage = getUsageForKernel(graphId, nodeId); // save the count CurrentThreadHoldingBlocks = smUsage; - LOG(DEBUG) << "Wait at SMBlocker: graph " << graphId << " node " << nodeId; - m_freeBlocks.wait(smUsage); - LOG(DEBUG) << "Passed at SMBlocker: graph " << graphId << " node " << nodeId; + LOG(DEBUG) << "Wait at SMBlocker: graph " << graphId << " node " << nodeId + << " sm " << smUsage << " priority " << priority; + m_freeBlocks.wait(smUsage, priority); + LOG(DEBUG) << "Passed at SMBlocker: graph " << graphId << " node " << nodeId + << " sm " << smUsage << " priority " << priority; } 
uint64_t SMBlocker::getUsageForKernel(uint64_t graphId, int nodeId) diff --git a/src/oplibraries/tensorflow/v3/smblocker.h b/src/oplibraries/tensorflow/v3/smblocker.h index dca7bfc..4ef2ed3 100644 --- a/src/oplibraries/tensorflow/v3/smblocker.h +++ b/src/oplibraries/tensorflow/v3/smblocker.h @@ -59,8 +59,24 @@ class SMBlocker void saveCurrentThreadResults(uint64_t graphId, int nodeId); - bool maybeBlock(uint64_t graphId, int nodeId); - void wait(uint64_t graphId, int nodeId); + /** + * @brief Non-blocking version of wait + * @param graphId + * @param nodeId + * @param priority Smaller priority is higher, default is 10 + * @return true if successfully get needed resource + */ + bool tryTake(uint64_t graphId, int nodeId, int priority); + + /** + * @brief Blocking wait + * @param graphId + * @param nodeId + * @param priority + */ + void wait(uint64_t graphId, int nodeId, int priority); + + static constexpr int MaxPriority = 100; private: static SMUsage queryAvailableSM(); @@ -71,7 +87,7 @@ class SMBlocker const SMUsage m_maxUsage; - sstl::semaphore m_freeBlocks; + sstl::priority_semaphore m_freeBlocks; using KernelId = std::pair; std::unordered_map> m_cache; diff --git a/src/oplibraries/tensorflow/v3/tf_executor.cpp b/src/oplibraries/tensorflow/v3/tf_executor.cpp index cf0fcea..1adc249 100644 --- a/src/oplibraries/tensorflow/v3/tf_executor.cpp +++ b/src/oplibraries/tensorflow/v3/tf_executor.cpp @@ -1104,6 +1104,11 @@ class ExecutorState return ready_.end(); } + auto size() const + { + return ready_.size(); + } + private: tf::gtl::InlinedVector ready_; size_t front_index_; @@ -1482,14 +1487,21 @@ void ExecutorState::Process(TaggedNode tagged_node, tf::int64) Status s; EntryVector outputs; bool completed = false; + uint64_t failedTake = 0; + auto priority = std::any_cast(impl_->params_.ins->userData()).priority; inline_ready.push_back(tagged_node); while (!inline_ready.empty()) { tagged_node = inline_ready.front(); - if 
(SMBlocker::instance().maybeBlock(impl_->graph_id_, tagged_node.node->id())) { - continue; + if (!SMBlocker::instance().tryTake(impl_->graph_id_, tagged_node.node->id(), priority)) { + ++failedTake; + if (failedTake < inline_ready.size()) { + continue; + } else { + SMBlocker::instance().wait(impl_->graph_id_, tagged_node.node->id(), priority); + failedTake = 0; + } } - SMBlocker::instance().wait(impl_->graph_id_, tagged_node.node->id()); inline_ready.pop_front(); diff --git a/src/rpcserver/zmqserver.cpp b/src/rpcserver/zmqserver.cpp index d4fd3fc..8626cfe 100644 --- a/src/rpcserver/zmqserver.cpp +++ b/src/rpcserver/zmqserver.cpp @@ -22,6 +22,7 @@ #include "rpcservercore.h" #include "platform/logging.h" #include "platform/signals.h" +#include "platform/thread_annotations.h" #include "utils/protoutils.h" #include "protos.h" @@ -292,6 +293,8 @@ void ZmqServer::sendMessage(MultiPartMessage &&parts) void ZmqServer::sendLoop() { + salus::threading::set_thread_name("ZmqSendLoop"); + zmq::socket_t sock(m_zmqCtx, zmq::socket_type::pair); sock.connect(kBeAddr); VLOG(2) << "Sending loop started"; diff --git a/src/utils/threadutils.h b/src/utils/threadutils.h index c267220..c7d854d 100644 --- a/src/utils/threadutils.h +++ b/src/utils/threadutils.h @@ -176,6 +176,80 @@ class semaphore bool may_block(uint64_t c = 1); }; +/** + * @brief Semaphore that can wait on count and with strict priority. + * As long as higher priority queue is not empty, lower priority request will wait. 
+ */ +template +class priority_semaphore +{ + std::mutex m_mu; + uint64_t m_pending[kMaxPriority]{}; + std::condition_variable m_queues[kMaxPriority]; + uint64_t m_count; + +public: + static_assert(kMaxPriority > 0, "Max priority must be greater than 0"); + static_assert(kDefaultPriority < kMaxPriority, "Default priority must be in the range [0, kMaxPriority)"); + + explicit priority_semaphore(uint64_t init = 0) : m_count(init) {} + + void post(uint64_t c = 1) + { + auto l = with_guard(m_mu); + m_count += c; + for (auto p = 0; p != kMaxPriority; ++p) { + if (m_pending[p] > 0) { + m_queues[p].notify_all(); + break; + } + } + } + + void wait(uint64_t c = 1, uint8_t p = kDefaultPriority) + { + auto lock = with_uguard(m_mu); + if (can_take(c, p)) { + m_count -= c; + return; + } + m_pending[p] += 1; + m_queues[p].wait(lock, [&]() { return can_take(c, p); }); + m_pending[p] -= 1; + m_count -= c; + } + + bool try_wait(uint64_t c = 1, uint8_t p = kDefaultPriority) + { + auto lock = with_guard(m_mu); + if (can_take(c, p)) { + m_count -= c; + return true; + } + return false; + } + +private: + /** + * @brief Must be called under lock of m_mu + * @param c + * @param p + * @return true if can take the resource at this p level + */ + bool can_take(uint64_t c, uint8_t p) + { + for (auto i = 0; i != p; ++i) { + if (m_pending[i] > 0) { + return false; + } + } + // whether to skip current priority level's queue if it's available? + // yes. because, when waken up in cv's wait, m_pending is not subtracted yet, + // but we still need to proceed + return m_count >= c; + } +}; + /** * Notification that is sticky. 
*/ diff --git a/tests/test_tf/lib/tfhelper.py b/tests/test_tf/lib/tfhelper.py index 08256ab..34838b2 100644 --- a/tests/test_tf/lib/tfhelper.py +++ b/tests/test_tf/lib/tfhelper.py @@ -9,6 +9,14 @@ import tensorflow as tf +def batch_size_from_env(default=1): + """Get batch size from environment variable SALUS_BATCH_SIZE""" + try: + return int(os.environ.get('SALUS_BATCH_SIZE', '')) + except ValueError: + return default + + def iteration_num_from_env(default=20): """Get iteration number from environment variable EXEC_ITER_NUMBER""" try: diff --git a/tests/test_tf/test_super_res.py b/tests/test_tf/test_super_res.py index 075c276..fff5fa2 100644 --- a/tests/test_tf/test_super_res.py +++ b/tests/test_tf/test_super_res.py @@ -19,6 +19,8 @@ def run_superres(sess, input_data, batch_size=100, isEval=False): + batch_size = tfhelper.batch_size_from_env(batch_size) + input_images, target_images = input_data(batch_size=batch_size) model = networks.SuperRes(input_images, target_images, batch_size=batch_size) @@ -57,7 +59,7 @@ def run_superres(sess, input_data, batch_size=100, isEval=False): print(fmt_str.format(datetime.now(), i, loss_value, examples_per_sec, sec_per_batch)) losses.append(loss_value) - if isEval: + if isEval and eval_rand_factor != '0': factor = 1 if eval_rand_factor != "1": factor = random.randint(1, int(eval_rand_factor)) diff --git a/tests/test_tf/test_vae.py b/tests/test_tf/test_vae.py index 59085e6..9e867e6 100644 --- a/tests/test_tf/test_vae.py +++ b/tests/test_tf/test_vae.py @@ -26,8 +26,11 @@ def run_vae(sess, args=None, isEval=False): if args is None: args = networks.vae.get_args() + batch_size = tfhelper.batch_size_from_env(args.batch_size) + print(f"Batch size: {batch_size}") + dim_img = IMAGE_SIZE_MNIST ** 2 # number of pixels for a MNIST image - x_image, _, num_classes = fake_data(args.batch_size, None, height=IMAGE_SIZE_MNIST, width=IMAGE_SIZE_MNIST, + x_image, _, num_classes = fake_data(batch_size, None, height=IMAGE_SIZE_MNIST, 
width=IMAGE_SIZE_MNIST, depth=1, num_classes=10) with tf.name_scope('model'): @@ -78,7 +81,7 @@ def run_vae(sess, args=None, isEval=False): last_end_time = end_time duration = end_time - start_time - examples_per_sec = args.batch_size / duration + examples_per_sec = batch_size / duration sec_per_batch = float(duration) speeds.append(sec_per_batch) @@ -87,7 +90,7 @@ def run_vae(sess, args=None, isEval=False): print(fmt_str.format(datetime.now(), i, loss_value, examples_per_sec, sec_per_batch)) - if isEval: + if isEval and eval_rand_factor != '0': factor = 1 if eval_rand_factor != "1": factor = random.randint(1, int(eval_rand_factor)) From d752c190ecdf214742026a91ed3cb7f8a8795cca Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Tue, 23 Apr 2019 23:37:36 -0400 Subject: [PATCH 10/13] Proper SM tracking --- bc | 2 +- benchmarks/driver/runner.py | 8 + benchmarks/driver/utils/prompt.py | 5 +- benchmarks/driver/workload.py | 7 +- benchmarks/exps/__init__.py | 13 +- benchmarks/exps/card304.py | 173 ++++++++++++++++- src/CMakeLists.txt | 38 ++-- .../threadpool/nonblockingthreadpool.cpp | 6 +- src/execution/threadpool/threadpool.h | 29 +++ src/oplibraries/tensorflow/device/gpu/gpu.cpp | 23 +++ src/oplibraries/tensorflow/device/gpu/gpu.h | 6 + .../tensorflow/device/gpu/smeventpoller.cpp | 178 ++++++++++++++++++ .../tensorflow/device/gpu/smeventpoller.h | 100 ++++++++++ .../tensorflow/device/shadowdevices.h | 2 + src/oplibraries/tensorflow/v3/smblocker.cpp | 37 ++-- src/oplibraries/tensorflow/v3/smblocker.h | 18 +- src/platform/logging.cpp | 2 +- src/platform/logging.h | 2 + src/wrapper/CMakeLists.txt | 2 + src/wrapper/salus-server.in | 30 +++ 20 files changed, 639 insertions(+), 42 deletions(-) create mode 100644 src/oplibraries/tensorflow/device/gpu/smeventpoller.cpp create mode 100644 src/oplibraries/tensorflow/device/gpu/smeventpoller.h create mode 100644 src/wrapper/CMakeLists.txt create mode 100755 src/wrapper/salus-server.in diff --git a/bc b/bc index 710dff7..6bdef04 
100755 --- a/bc +++ b/bc @@ -1,2 +1,2 @@ #! /bin/bash -python -m benchmarks.driver "$@" +vex tfbuild python -m benchmarks.driver "$@" diff --git a/benchmarks/driver/runner.py b/benchmarks/driver/runner.py index 42482d1..36c8ab3 100644 --- a/benchmarks/driver/runner.py +++ b/benchmarks/driver/runner.py @@ -164,6 +164,8 @@ def __call__(self, executor, output_file): '--model_dir=' + eval_model_dir, ] + cmd += self.wl.extra_args + if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) else: @@ -196,6 +198,7 @@ def __call__(self, executor, output_file): 'stdbuf', '-o0', '-e0', '--', 'python', '-m', pkg, method, ] + cmd += self.wl.extra_args if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) else: @@ -303,6 +306,8 @@ def __call__(self, executor, output_file): else: raise ValueError(f'Unknown executor: {executor}') + cmd += self.wl.extra_args + if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) else: @@ -345,6 +350,7 @@ def __call__(self, executor, output_file): ] else: raise ValueError(f'Unknown executor: {executor}') + cmd += self.wl.extra_args if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) @@ -398,6 +404,7 @@ def __call__(self, executor, output_file): cmd += [ '--num_replicas', num_replicas ] + cmd += self.wl.extra_args if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) @@ -435,6 +442,7 @@ def __call__(self, executor, output_file): # always write plan to stdin '-', ] + cmd += self.wl.extra_args proc = execute(cmd, cwd=str(cwd), env=self.env, stdin=sp.PIPE) proc.stdin.write(self._plan_to_bytes()) diff --git a/benchmarks/driver/utils/prompt.py b/benchmarks/driver/utils/prompt.py index 155bd3d..0d706db 100644 --- a/benchmarks/driver/utils/prompt.py +++ b/benchmarks/driver/utils/prompt.py @@ -93,4 +93,7 @@ def pause(prompt='Press enter to continue...'): """Pause the execution and wait the user to press enter""" # we don't want to guard against KeyboardInterrupt - input(prompt) + try: 
+ input(prompt) + except EOFError: + pass diff --git a/benchmarks/driver/workload.py b/benchmarks/driver/workload.py index 93f9489..0485672 100644 --- a/benchmarks/driver/workload.py +++ b/benchmarks/driver/workload.py @@ -25,7 +25,7 @@ import csv import logging from collections import defaultdict -from typing import Dict, Iterable, Type, Union +from typing import Dict, Iterable, Type, Union, Optional from .runner import Runner, RunConfig, Popen, Executor from .runner import TFBenchmarkRunner, UnittestRunner, FathomRunner, TFWebRunner, TFWebClientRunner @@ -357,8 +357,9 @@ def __init__(self, wtl, rcfg, executor, geo): self.rcfg = rcfg self.executor = executor self._geo = geo - self.proc = None # type: Popen - self.output_file = None # type: Path + self.proc = None # type: Optional[Popen] + self.output_file = None # type: Optional[Path] + self.extra_args = [] @property def name(self): diff --git a/benchmarks/exps/__init__.py b/benchmarks/exps/__init__.py index 41187ab..9262e05 100644 --- a/benchmarks/exps/__init__.py +++ b/benchmarks/exps/__init__.py @@ -42,6 +42,7 @@ FLAGS = flags.FLAGS flags.DEFINE_boolean('ignore_error', False, 'Ignore error on workload') +flags.DEFINE_string('logconf', None, 'Override default logconf in preset') class Pause(int): @@ -236,11 +237,15 @@ def parse_actions_from_cmd(argv): def maybe_forced_preset(default): # type: (Callable[[], SalusConfig]) -> SalusConfig """Maybe return forced preset""" + preset_ctor = default if FLAGS.force_preset: - logger.info(f'Using server config preset: {FLAGS.force_preset}') - return getattr(presets, FLAGS.force_preset)() - logger.info(f'Using server config preset: {default.__name__}') - return default() + preset_ctor = getattr(presets, FLAGS.force_preset) + logger.info(f'Using server config preset: {preset_ctor.__name__}') + scfg = preset_ctor() + if FLAGS.logconf is not None: + logger.info(f'Using server logconf: {FLAGS.logconf}') + scfg.logconf = FLAGS.logconf + return scfg def 
parse_output_float(outputfile, pattern, group=1): diff --git a/benchmarks/exps/card304.py b/benchmarks/exps/card304.py index bd193ef..8c5b872 100644 --- a/benchmarks/exps/card304.py +++ b/benchmarks/exps/card304.py @@ -38,7 +38,11 @@ from benchmarks.driver.server.config import presets from benchmarks.driver.workload import WTL, Executor from benchmarks.driver.utils.compatiblity import pathlib -from benchmarks.exps import run_seq, maybe_forced_preset, RunFn, sync_on_pipe, Pause, wait_on_pipe, release_on_pipe +from benchmarks.exps import ( + run_seq, maybe_forced_preset, RunFn, Pause, wait_on_pipe, release_on_pipe, + case_switch_main, + run_tfdist, run_tf +) FLAGS = flags.FLAGS @@ -55,10 +59,9 @@ def set_env(wl): wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = model_dir -def main(argv): +def salus(argv): # type: (Sequence[str]) -> None scfg = maybe_forced_preset(presets.MostEfficient) - scfg.logconf = 'disable' name = "alexneteval" if len(argv) > 1: @@ -93,7 +96,7 @@ def main(argv): os.mkfifo(pipe) wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe - run_seq(scfg.copy(output_dir=FLAGS.save_dir / (name + "-inception4")), + run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus" / (name + "-inception4")), trainWl, # start the background job wl, # start the foreground job # wait for both jobs to be ready @@ -108,3 +111,165 @@ def main(argv): # run_seq automatically join all jobs at the end of the sequence ) + +def tfdist(argv): + # type: (Sequence[str]) -> None + name = "alexneteval" + if len(argv) > 1: + name = argv[0] + batch_sizes = [int(v) for v in argv[1:]] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + batch_num = 300 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for idx, bs in enumerate(batch_sizes): + with tempfile.TemporaryDirectory() as td: + # create a background training job, the batch number has no effect here, + # only used to distinguish different runs + trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TFDist) 
+ # make sure it runs long enough + trainWl.env['SALUS_ITER_SECONDS'] = '300' + + # create a pipe to signal trainWl + pipetrain = str(pathlib.Path(td).joinpath('fifotrain')) + os.mkfifo(pipetrain) + trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain + + # create the foreground inference job + wl = WTL.create(name, bs, batch_num, executor=Executor.TFDist) + set_env(wl) + wl.env['SALUS_ITER_SECONDS'] = '150' + + pipe = str(pathlib.Path(td).joinpath('fifo')) + os.mkfifo(pipe) + wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe + + run_tfdist(FLAGS.save_dir / "tfdist" / (name + "-inception4"), + trainWl, # start the background job + wl, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +def tfmps(argv): + # type: (Sequence[str]) -> None + name = "alexneteval" + if len(argv) > 1: + name = argv[0] + batch_sizes = [int(v) for v in argv[1:]] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + batch_num = 300 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for idx, bs in enumerate(batch_sizes): + with tempfile.TemporaryDirectory() as td: + # create a background training job, the batch number has no effect here, + # only used to distinguish different runs + trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TF) + # make sure it runs long enough + trainWl.env['SALUS_ITER_SECONDS'] = '300' + trainWl.extra_args += ['--min_mem'] + + # create a pipe to signal trainWl + pipetrain = str(pathlib.Path(td).joinpath('fifotrain')) + os.mkfifo(pipetrain) + trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain + + # create the foreground inference job + wl = 
WTL.create(name, bs, batch_num, executor=Executor.TF) + set_env(wl) + wl.env['SALUS_ITER_SECONDS'] = '150' + wl.extra_args += ['--min_mem'] + + pipe = str(pathlib.Path(td).joinpath('fifo')) + os.mkfifo(pipe) + wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe + + run_tf(FLAGS.save_dir / "tfmps" / (name + "-inception4"), + trainWl, # start the background job + wl, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +def tfmps2(argv): + # type: (Sequence[str]) -> None + name = "alexneteval" + if len(argv) > 1: + name = argv[0] + batch_sizes = [int(v) for v in argv[1:]] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + batch_num = 300 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for idx, bs in enumerate(batch_sizes): + with tempfile.TemporaryDirectory() as td: + # create a background training job, the batch number has no effect here, + # only used to distinguish different runs + trainWl = WTL.create('inception4', 50, 100 + idx, executor=Executor.TF) + # make sure it runs long enough + trainWl.env['SALUS_ITER_SECONDS'] = '300' + trainWl.extra_args += ['--min_mem'] + + # create a pipe to signal trainWl + pipetrain = str(pathlib.Path(td).joinpath('fifotrain')) + os.mkfifo(pipetrain) + trainWl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain + + # create the foreground inference job + wl = WTL.create(name, bs, batch_num, executor=Executor.TF) + set_env(wl) + wl.env['SALUS_ITER_SECONDS'] = '150' + wl.extra_args += ['--min_mem'] + + pipe = str(pathlib.Path(td).joinpath('fifo')) + os.mkfifo(pipe) + wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe + + run_tf(FLAGS.save_dir / 
"tfmps2" / (name + "-inception4"), + wl, # start the foreground job + Pause(20), + trainWl, # start the background job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +@case_switch_main +def main(): + return salus, tfdist, tfmps, tfmps2 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4fa7123..df96701 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -68,14 +68,15 @@ if(USE_TENSORFLOW) "oplibraries/tensorflow/device/salusdevices.cpp" "oplibraries/tensorflow/device/cpu.cpp" "oplibraries/tensorflow/device/gpu/gpu.cpp" + "oplibraries/tensorflow/device/gpu/smeventpoller.cpp" "oplibraries/tensorflow/device/gpu/lane/lanemgr.cpp" "oplibraries/tensorflow/device/gpu/sessiondevice.cpp" "oplibraries/tensorflow/device/sessionallocator.cpp" ) endif(USE_TENSORFLOW) -add_executable(salus-server ${SRC_LIST}) -target_link_libraries(salus-server +add_executable(salus-server-exec ${SRC_LIST}) +target_link_libraries(salus-server-exec protos_gen platform @@ -88,20 +89,20 @@ target_link_libraries(salus-server ) if(USE_TENSORFLOW) - target_link_libraries(salus-server + target_link_libraries(salus-server-exec tensorflow::kernels ) - target_compile_definitions(salus-server + target_compile_definitions(salus-server-exec PRIVATE GOOGLE_CUDA=1 ) endif(USE_TENSORFLOW) -target_link_options(salus-server +target_link_options(salus-server-exec PRIVATE "LINKER:--dynamic-list=${CMAKE_CURRENT_SOURCE_DIR}/salus-server.list" ) -set_target_properties(salus-server PROPERTIES +set_target_properties(salus-server-exec PROPERTIES LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/salus-server.list ) @@ -109,11 +110,11 @@ 
set_target_properties(salus-server PROPERTIES # Instrucment #--------------------------------------------------------------------------------------- if(WITH_GPERFTOOLS AND WITH_TCMALLOC) - target_link_libraries(salus-server gperftools::tcmalloc_and_profiler) + target_link_libraries(salus-server-exec gperftools::tcmalloc_and_profiler) elseif(WITH_GPERFTOOLS) - target_link_libraries(salus-server gperftools::profiler) + target_link_libraries(salus-server-exec gperftools::profiler) elseif(WITH_TCMALLOC) - target_link_libraries(salus-server gperftools::tcmalloc) + target_link_libraries(salus-server-exec gperftools::tcmalloc) endif() #--------------------------------------------------------------------------------------- @@ -121,17 +122,32 @@ endif() #--------------------------------------------------------------------------------------- add_subdirectory(cudahook) +#--------------------------------------------------------------------------------------- +# Exec Wrapper +#--------------------------------------------------------------------------------------- +add_subdirectory(wrapper) +file(COPY ${CMAKE_CURRENT_BINARY_DIR}/wrapper/salus-server + DESTINATION ${CMAKE_CURRENT_BINARY_DIR} + FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_WRITE GROUP_EXECUTE + WORLD_EXECUTE +) +install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/salus-server + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin +) + #--------------------------------------------------------------------------------------- # Installation #--------------------------------------------------------------------------------------- -#set_target_properties(salus-server PROPERTIES +#set_target_properties(salus-server-exec PROPERTIES # INSTALL_RPATH "${SALUS_DEPS_PATH}/lib" #) -install(TARGETS salus-server +install(TARGETS salus-server-exec RUNTIME DESTINATION bin ) + # also install dependencies in spack-packages #if (DEFINED SALUS_DEPS_PATH) # if (EXISTS ${SALUS_DEPS_PATH}/lib) diff --git 
a/src/execution/threadpool/nonblockingthreadpool.cpp b/src/execution/threadpool/nonblockingthreadpool.cpp index 57143ea..39a24b7 100644 --- a/src/execution/threadpool/nonblockingthreadpool.cpp +++ b/src/execution/threadpool/nonblockingthreadpool.cpp @@ -330,7 +330,11 @@ int ThreadPoolPrivate::nonEmptyQueueIndex() void ThreadPoolPrivate::workerLoop(int thread_id) { - salus::threading::set_thread_name("ThreadPoolWorker"); + if (m_options.workerName.empty()) { + salus::threading::set_thread_name("ThreadPoolWorker"); + } else { + salus::threading::set_thread_name(m_options.workerName); + } const auto numThreads = m_options.numThreads; const auto spinCount = m_options.spinCount; diff --git a/src/execution/threadpool/threadpool.h b/src/execution/threadpool/threadpool.h index 7f358f0..c22143f 100644 --- a/src/execution/threadpool/threadpool.h +++ b/src/execution/threadpool/threadpool.h @@ -33,17 +33,46 @@ struct ThreadPoolOptions */ size_t numThreads = 0; + ThreadPoolOptions &setNumThreads(size_t num) + { + numThreads = num; + return *this; + } + /** * Whether allow spinning wait in worker threads for lower latency */ bool allowSpinning = true; + ThreadPoolOptions &setAllowSpinning(bool allow) + { + allowSpinning = allow; + return *this; + } + /** * Times of tries for spin wait before go to wait. * Use -1 for default value, which is 5000 / numThreads */ int spinCount = -1; + ThreadPoolOptions &setSpinCount(int count) + { + spinCount = count; + return *this; + } + + /** + * @brief Optional worker thread name, truncated at 16 characters. 
+ */ + std::string workerName = ""; + + ThreadPoolOptions &setWorkerName(const std::string &name) + { + workerName = name; + return *this; + } + ThreadPoolOptions(); ThreadPoolOptions(const ThreadPoolOptions &) = default; ThreadPoolOptions(ThreadPoolOptions &&) = default; diff --git a/src/oplibraries/tensorflow/device/gpu/gpu.cpp b/src/oplibraries/tensorflow/device/gpu/gpu.cpp index e5d54d6..465c552 100644 --- a/src/oplibraries/tensorflow/device/gpu/gpu.cpp +++ b/src/oplibraries/tensorflow/device/gpu/gpu.cpp @@ -23,6 +23,7 @@ #include "execution/engine/resourcecontext.h" #include "oplibraries/tensorflow/device/gpu/sessiondevice.h" +#include "oplibraries/tensorflow/v3/smblocker.h" #include "utils/threadutils.h" #include @@ -37,7 +38,11 @@ SalusGPUDevice::SalusGPUDevice(const tf::SessionOptions &options, const std::str false /* sync every op */, max_streams) , m_streamUsed(static_cast(max_streams), false) , m_cudaHostAlloc(cuda_host_alloc) + , m_SMPoller(nullptr) { + auto executor_status = tf::GPUMachineManager()->ExecutorForDevice(gpu_id); + + m_SMPoller = std::make_unique(executor_status.ValueOrDie()); } tf::Allocator *SalusGPUDevice::GetAllocator(tf::AllocatorAttributes attr) @@ -55,6 +60,24 @@ tf::Allocator *SalusGPUDevice::GetAllocator(tf::AllocatorAttributes attr) return gpu_allocator_; } +void SalusGPUDevice::Compute(tf::OpKernel *op_kernel, tf::OpKernelContext *context) +{ + BaseGPUDevice::Compute(op_kernel, context); + + m_SMPoller->thenReleaseSM(context->op_device_context()->stream(), + SMBlocker::instance().currentThreadSMHolding()); +} + +void SalusGPUDevice::ComputeAsync(tf::AsyncOpKernel *op_kernel, tf::OpKernelContext *context, + tf::AsyncOpKernel::DoneCallback done) +{ + BaseGPUDevice::ComputeAsync(op_kernel, context, [this, context, done = std::move(done)]() { + m_SMPoller->thenReleaseSM(context->op_device_context()->stream(), + SMBlocker::instance().currentThreadSMHolding()); + done(); + }); +} + Status SalusGPUDevice::Sync() { return 
BaseGPUDevice::Sync(); diff --git a/src/oplibraries/tensorflow/device/gpu/gpu.h b/src/oplibraries/tensorflow/device/gpu/gpu.h index ef930a8..3458d3f 100644 --- a/src/oplibraries/tensorflow/device/gpu/gpu.h +++ b/src/oplibraries/tensorflow/device/gpu/gpu.h @@ -22,6 +22,7 @@ #include "oplibraries/tensorflow/tensorflow_headers.h" #include "oplibraries/tensorflow/device/salusdevices.h" +#include "oplibraries/tensorflow/device/gpu/smeventpoller.h" #include "utils/objectpool.h" #include @@ -50,6 +51,10 @@ class SalusGPUDevice : public ISalusDevice, public tf::BaseGPUDevice Status FillContextMap(const tf::Graph *graph, std::vector *device_context_map) override; + void Compute(tf::OpKernel *op_kernel, tf::OpKernelContext *context) override; + void ComputeAsync(tf::AsyncOpKernel *op_kernel, tf::OpKernelContext *context, + tf::AsyncOpKernel::DoneCallback done) override; + void flushCacheFor(sstl::not_null graph) override; std::shared_ptr createPerTaskDevice(sstl::not_null graph, @@ -99,6 +104,7 @@ class SalusGPUDevice : public ISalusDevice, public tf::BaseGPUDevice std::mutex m_muStream; std::vector m_streamUsed; tf::Allocator *m_cudaHostAlloc; + std::unique_ptr m_SMPoller; }; class SalusGPUDeviceFactory : public tf::BaseGPUDeviceFactory diff --git a/src/oplibraries/tensorflow/device/gpu/smeventpoller.cpp b/src/oplibraries/tensorflow/device/gpu/smeventpoller.cpp new file mode 100644 index 0000000..c4a42d2 --- /dev/null +++ b/src/oplibraries/tensorflow/device/gpu/smeventpoller.cpp @@ -0,0 +1,178 @@ +/* + * Copyright 2019 Peifeng Yu + * + * This file is part of Salus + * (see https://github.com/SymbioticLab/Salus). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "smeventpoller.h" + +#include "oplibraries/tensorflow/v3/smblocker.h" +#include "platform/thread_annotations.h" + +namespace salus::oplib::tensorflow { + +namespace { + +} // namespace + +SMEventPoller::SMEventPoller(tf::gpu::StreamExecutor *se) + : m_pool(ThreadPoolOptions{} + .setWorkerName("SMEvtWorker") + // one thread for poller, one thread for executing callbacks + .setNumThreads(2)) + , m_se(se) +{ + startPollingLoop(); +} + +SMEventPoller::~SMEventPoller() +{ + stopPollingLoop(); + + // free anything owned by this + for (auto &act : m_pendingActions) { + SMBlocker::instance().release(act.count); + if (act.func) { + act.func(); + } + } +} + +void SMEventPoller::startPollingLoop() +{ + m_pool.run([this]() { + pollLoop(); + }); +} + +void SMEventPoller::stopPollingLoop() +{ + m_stopPolling.notify(); + // make sure to wake up polling loop thread + m_eventsStaging.notify(); + m_pollingStopped.wait(); +} + +void SMEventPoller::pollLoop() +{ + threading::set_thread_name("SMEvtPoller"); + // actions go from m_stagedEvents to staging, to waiting and finally to ready + while (!m_stopPolling.notified()) { + PendingActions staging; + { + auto g = sstl::with_guard(m_mu); + staging.swap(m_stagedEvents); + } + + m_pendingActions.insert(m_pendingActions.end(), + std::make_move_iterator(staging.begin()), + std::make_move_iterator(staging.end())); + + if (m_pendingActions.empty()) { + m_eventsStaging.wait(); + continue; + } + + auto ready = pollEvents(); + executeReady(ready); + } + m_pollingStopped.notify(); +} + 
+SMEventPoller::PendingActions SMEventPoller::pollEvents() +{ + if (VLOG_IS_ON(2)) { + size_t freeSize; + { + auto g = sstl::with_guard(m_mu); + freeSize = m_freeEvents.size(); + } + VLOG(2) << "SMEventPoller m_freeEvents " << freeSize << " m_pendingActions " << m_pendingActions.size(); + } + PendingActions ready; + auto it = m_pendingActions.begin(); + while (it != m_pendingActions.end()) { + auto &act = *it; + CHECK_NOTNULL(act.event); + auto s = act.event->PollForStatus(); + switch (s) { + default: + case tf::gpu::Event::Status::kUnknown: + case tf::gpu::Event::Status::kError: + // We don't expect to see these. Someday maybe propagate + // a Status error, but for now fail hard. + LOG(FATAL) << "Unexpected Event status: " << static_cast(s); + break; + case tf::gpu::Event::Status::kPending: + break; + case tf::gpu::Event::Status::kComplete: + // add event back to free event + { + auto g = sstl::with_guard(m_mu); + m_freeEvents.emplace_back(std::move(act.event)); + } + // add action to ready + ready.emplace_back(std::move(act)); + // remove from pending + it = m_pendingActions.erase(it); + // skip ++it + continue; + } + + ++it; + } + return ready; +} + +void SMEventPoller::executeReady(SMEventPoller::PendingActions &ready) +{ + for (auto &act : ready) { + SMBlocker::instance().release(act.count); + if (act.func) { + act.func(); + } + } +} + +void SMEventPoller::queueAction(tf::gpu::Stream *stream, PendingAction act) +{ + act.event = allocEvent(); + CHECK_NOTNULL(act.event); + stream->ThenRecordEvent(act.event.get()); + + { + auto g = sstl::with_guard(m_mu); + m_stagedEvents.emplace_back(std::move(act)); + } + // Wake up the polling thread + m_eventsStaging.notify(); +} + +std::unique_ptr SMEventPoller::allocEvent() +{ + auto g = sstl::with_guard(m_mu); + // Events are created on demand, and repeatedly reused. There is no + // limit placed here on the number of allocated Events. 
+ if (m_freeEvents.empty()) { + m_freeEvents.emplace_back(std::make_unique(m_se)); + m_freeEvents.back()->Init(); + } + auto e = std::move(m_freeEvents.back()); + m_freeEvents.pop_back(); + return e; +} + +} // namespace salus::oplib::tensorflow diff --git a/src/oplibraries/tensorflow/device/gpu/smeventpoller.h b/src/oplibraries/tensorflow/device/gpu/smeventpoller.h new file mode 100644 index 0000000..1a51bb3 --- /dev/null +++ b/src/oplibraries/tensorflow/device/gpu/smeventpoller.h @@ -0,0 +1,100 @@ +/* + * Copyright 2019 Peifeng Yu + * + * This file is part of Salus + * (see https://github.com/SymbioticLab/Salus). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SALUS_OPLIB_TENSORFLOW_SMEVENTPOLLER_H +#define SALUS_OPLIB_TENSORFLOW_SMEVENTPOLLER_H + +#include "oplibraries/tensorflow/tensorflow_headers.h" + +#include "execution/threadpool/threadpool.h" +#include "utils/fixed_function.hpp" +#include "utils/threadutils.h" +#include "utils/pointerutils.h" + +#include +#include +#include + +namespace salus::oplib::tensorflow { + +class SMEventPoller +{ +public: + explicit SMEventPoller(tf::gpu::StreamExecutor *se); + ~SMEventPoller(); + + inline void thenReleaseSM(tf::gpu::Stream *stream, uint64_t count) + { + if (count == 0) { + return; + } + queueAction(stream, {count, {}, nullptr}); + } + + inline void thenExecute(tf::gpu::Stream *stream, sstl::FixedFunction func) + { + queueAction(stream, {{}, std::move(func), nullptr}); + } + +private: + // Posting action from other threads + struct PendingAction + { + uint64_t count; // num of SMs to release + sstl::FixedFunction func; // action to execute + std::unique_ptr event; // perform action after this event + }; + + using PendingActions = std::vector; + + std::unique_ptr allocEvent(); + + void queueAction(tf::gpu::Stream *stream, PendingAction action); + + void startPollingLoop(); + void stopPollingLoop(); + + void pollLoop(); + PendingActions pollEvents(); + void executeReady(PendingActions &ready); + + // pending actions waiting for its events, in order + std::list m_pendingActions; + + // Threading related variables + sstl::notification m_stopPolling; + sstl::notification m_pollingStopped; + + ThreadPool m_pool; + + // other threads put actions into this queue, which will be regularly picked up by polling thread + PendingActions m_stagedEvents GUARDED_BY(m_mu); + std::mutex m_mu; + sstl::notification m_eventsStaging; + + // GPU Event related variables + tf::gpu::StreamExecutor * const m_se; + + // Free events + std::vector> m_freeEvents GUARDED_BY(m_mu); +}; + +} // namespace salus::oplib::tensorflow + +#endif // SALUS_OPLIB_TENSORFLOW_SMEVENTPOLLER_H diff --git 
a/src/oplibraries/tensorflow/device/shadowdevices.h b/src/oplibraries/tensorflow/device/shadowdevices.h index 6690bd0..df5cfe3 100644 --- a/src/oplibraries/tensorflow/device/shadowdevices.h +++ b/src/oplibraries/tensorflow/device/shadowdevices.h @@ -111,6 +111,8 @@ class ShadowDevice : public tf::Device ~ShadowDevice() override; + sstl::not_null base() const { return m_base; } + // Hook allocators tf::Allocator *GetAllocator(tf::AllocatorAttributes attr) override; tf::Allocator *GetStepAllocator(tf::AllocatorAttributes attr, diff --git a/src/oplibraries/tensorflow/v3/smblocker.cpp b/src/oplibraries/tensorflow/v3/smblocker.cpp index 4220473..7114701 100644 --- a/src/oplibraries/tensorflow/v3/smblocker.cpp +++ b/src/oplibraries/tensorflow/v3/smblocker.cpp @@ -69,11 +69,9 @@ void salus_kernel_launch_callback(unsigned int gridDimX, unsigned int gridDimY, {blockDimX, blockDimY, blockDimZ}, sharedMemBytes, }); - if (false) { - LOG(DEBUG) << "Got kernel launch params: blk=(" - << gridDimX << "," << gridDimY << "," << gridDimZ - << ") x thd=(" << blockDimX << "," << blockDimY << "," << blockDimZ << "), " << sharedMemBytes; - } + VLOG(3) << "Got kernel launch params: blk=(" + << gridDimX << "," << gridDimY << "," << gridDimZ + << ") x thd=(" << blockDimX << "," << blockDimY << "," << blockDimZ << "), " << sharedMemBytes; } } // extern "C" @@ -103,21 +101,25 @@ SMBlocker::SMBlocker() { } +uint64_t SMBlocker::currentThreadSMHolding() const +{ + return CurrentThreadHoldingBlocks; +} + void SMBlocker::saveCurrentThreadResults(uint64_t graphId, int nodeId) { - LOG(DEBUG) << "Release at SMBlocker: graph " << graphId << " node " << nodeId - << " sm " << CurrentThreadHoldingBlocks; - // release blocks first - { - m_freeBlocks.post(CurrentThreadHoldingBlocks); - CurrentThreadHoldingBlocks = 0; - } + // reset current thread value + CurrentThreadHoldingBlocks = 0; // update cache std::unique_lock l{m_mu}; SMUsage newUsage{0, 0}; + LOG(DEBUG) << "SavedCudaKernelLaunches " << 
SavedCudaKernelLaunches.size(); for (const auto &res : SavedCudaKernelLaunches) { + LOG(DEBUG) << "SavedCudaKernelLaunches: blk=(" + << res.blockCount.x << "," << res.blockCount.y << "," << res.blockCount.z + << ") x thd=(" << res.threadPerBlock.x << "," << res.threadPerBlock.y << "," << res.threadPerBlock.z << ")"; newUsage.threadPerBlock = max(newUsage.threadPerBlock, res.threadPerBlock); newUsage.blockCount = max(newUsage.blockCount, res.blockCount); } @@ -141,7 +143,7 @@ bool SMBlocker::tryTake(uint64_t graphId, int nodeId, int priority) if (res) { // save the count CurrentThreadHoldingBlocks = smUsage; - LOG(DEBUG) << "Passed at SMBlocker: graph " << graphId << " node " << nodeId + LogSMTracing() << "Passed at SMBlocker: graph " << graphId << " node " << nodeId << " sm " << smUsage << " priority " << priority; } return res; @@ -154,10 +156,10 @@ void SMBlocker::wait(uint64_t graphId, int nodeId, int priority) // save the count CurrentThreadHoldingBlocks = smUsage; - LOG(DEBUG) << "Wait at SMBlocker: graph " << graphId << " node " << nodeId + LogSMTracing() << "Wait at SMBlocker: graph " << graphId << " node " << nodeId << " sm " << smUsage << " priority " << priority; m_freeBlocks.wait(smUsage, priority); - LOG(DEBUG) << "Passed at SMBlocker: graph " << graphId << " node " << nodeId + LogSMTracing() << "Passed at SMBlocker: graph " << graphId << " node " << nodeId << " sm " << smUsage << " priority " << priority; } @@ -170,4 +172,9 @@ uint64_t SMBlocker::getUsageForKernel(uint64_t graphId, int nodeId) return std::min(usage.blockCount, m_maxUsage.blockCount); } +void SMBlocker::release(uint64_t numSms) +{ + m_freeBlocks.post(numSms); +} + } // namespace salus::oplib::tensorflow diff --git a/src/oplibraries/tensorflow/v3/smblocker.h b/src/oplibraries/tensorflow/v3/smblocker.h index 4ef2ed3..89a54fb 100644 --- a/src/oplibraries/tensorflow/v3/smblocker.h +++ b/src/oplibraries/tensorflow/v3/smblocker.h @@ -57,6 +57,22 @@ class SMBlocker public: static SMBlocker 
&instance(); + /** + * @brief Release this amount of numSms + */ + void release(uint64_t numSms); + + /** + * @brief Return the number of sms held by current thread + * @return + */ + uint64_t currentThreadSMHolding() const; + + /** + * @brief Save current thread's launch parameter + * @param graphId + * @param nodeId + */ void saveCurrentThreadResults(uint64_t graphId, int nodeId); /** @@ -69,7 +85,7 @@ class SMBlocker bool tryTake(uint64_t graphId, int nodeId, int priority); /** - * @brief Blocking wait + * @brief Blocking wait, takes SMs * @param graphId * @param nodeId * @param priority diff --git a/src/platform/logging.cpp b/src/platform/logging.cpp index 369ceb0..9547064 100644 --- a/src/platform/logging.cpp +++ b/src/platform/logging.cpp @@ -117,7 +117,7 @@ void initialize(const Params ¶ms) // Force to create loggers here with default configuration // in non-performance sensitive code path. - for (auto tag : {logging::kAllocTag, logging::kOpTracing, logging::kPerfTag, logging::kDefTag}) { + for (auto tag : {logging::kSMTag, logging::kAllocTag, logging::kOpTracing, logging::kPerfTag, logging::kDefTag}) { auto logger = Loggers::getLogger(tag); DCHECK(logger); } diff --git a/src/platform/logging.h b/src/platform/logging.h index 0f1853e..6ff6355 100644 --- a/src/platform/logging.h +++ b/src/platform/logging.h @@ -116,6 +116,7 @@ constexpr const auto kAllocTag = "alloc"; constexpr const auto kPerfTag = "performance"; constexpr const auto kOpTracing = "optracing"; constexpr const auto kDefTag = "default"; +constexpr const auto kSMTag = "smtracing"; // logging configurations struct Params @@ -133,6 +134,7 @@ void initialize(const Params ¶ms); #define LogPerf() CLOG(TRACE, logging::kPerfTag) #define LogAlloc() CLOG(TRACE, logging::kAllocTag) #define LogOpTracing() CLOG(TRACE, logging::kOpTracing) +#define LogSMTracing() CLOG(TRACE, logging::kSMTag) // Additional operator<< implementations MAKE_LOGGABLE(std::exception_ptr, ep, os); diff --git 
a/src/wrapper/CMakeLists.txt b/src/wrapper/CMakeLists.txt new file mode 100644 index 0000000..a2eef04 --- /dev/null +++ b/src/wrapper/CMakeLists.txt @@ -0,0 +1,2 @@ +# configure and install helper script +configure_file(salus-server.in salus-server @ONLY) diff --git a/src/wrapper/salus-server.in b/src/wrapper/salus-server.in new file mode 100755 index 0000000..315199b --- /dev/null +++ b/src/wrapper/salus-server.in @@ -0,0 +1,30 @@ +#! /bin/bash + +# Get current file directory +SOURCE="${BASH_SOURCE[0]}" +while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink + DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + SOURCE="$(readlink "$SOURCE")" + [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +done +DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + +# Find salus executable, it should be in the same directory as this script +SALUS_EXECUTABLE=$DIR/salus-server-exec +# Find cudahook library, it should be either in ../lib/libcudahook.so, +# or, if we are in the build tree, in ./cudahook/libcudahook.so +LIBCUDAHOOK=$(realpath $DIR/../lib/libcudahook.so 2>/dev/null) +if [[ ! -f "$LIBCUDAHOOK" ]]; then + LIBCUDAHOOK=$(realpath $DIR/cudahook/libcudahook.so 2>/dev/null) +fi + +if [[ ! -f "$LIBCUDAHOOK" ]]; then + echo "libcudahook.so not found!" 
>&2 + exit -1 +fi + +TF_MIN_CPP_LOG_LEVEL=${TF_MIN_CPP_LOG_LEVEL:-4} +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-2,3} + +export LD_PRELOAD=$LIBCUDAHOOK +exec $SALUS_EXECUTABLE "$@" From 19df3e46f945345a59bdcc4cc895101bf0a79c65 Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Thu, 25 Apr 2019 13:20:02 -0400 Subject: [PATCH 11/13] New SM tracing exp --- benchmarks/driver/runner.py | 21 ++++- benchmarks/exps/smtracing.py | 175 +++++++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 4 deletions(-) create mode 100644 benchmarks/exps/smtracing.py diff --git a/benchmarks/driver/runner.py b/benchmarks/driver/runner.py index 36c8ab3..6d51252 100644 --- a/benchmarks/driver/runner.py +++ b/benchmarks/driver/runner.py @@ -118,8 +118,8 @@ def __call__(self, executor, output_file): '--num_batches={}'.format(self.wl.batch_num), '--batch_size={}'.format(self.wl.batch_size), ] - eval_interval = self.wl.env.pop('SALUS_TFBENCH_EVAL_INTERVAL', '0.1') - eval_rand_factor = self.wl.env.pop('SALUS_TFBENCH_EVAL_RAND_FACTOR', '5') + eval_interval = self.wl.env.pop('SALUS_TFBENCH_EVAL_INTERVAL', None) + eval_rand_factor = self.wl.env.pop('SALUS_TFBENCH_EVAL_RAND_FACTOR', None) eval_block = self.wl.env.pop('SALUS_TFBENCH_EVAL_BLOCK', 'true') eval_model_dir = self.wl.env.pop('SALUS_TFBENCH_EVAL_MODEL_DIR', 'models') @@ -146,11 +146,17 @@ def __call__(self, executor, output_file): cmd += [ '--model_dir=' + eval_model_dir, '--model={}'.format(model_name), - '--eval_interval_secs={}'.format(eval_interval), - '--eval_interval_random_factor={}'.format(eval_rand_factor), '--eval_block={}'.format(eval_block), '--eval' ] + if eval_interval is not None: + cmd += [ + '--eval_interval_secs={}'.format(eval_interval), + ] + if eval_rand_factor is not None: + cmd += [ + '--eval_interval_random_factor={}'.format(eval_rand_factor), + ] if eval_saved_model_dir is not None: cmd += [ '--saved_model_dir=' + eval_saved_model_dir @@ -165,6 +171,7 @@ def __call__(self, executor, output_file): ] cmd
+= self.wl.extra_args + logger.info(f'Starting workload with cmd: {cmd}') if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) @@ -199,6 +206,8 @@ def __call__(self, executor, output_file): 'python', '-m', pkg, method, ] cmd += self.wl.extra_args + + logger.info(f'Starting workload with cmd: {cmd}') if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) else: @@ -307,6 +316,7 @@ def __call__(self, executor, output_file): raise ValueError(f'Unknown executor: {executor}') cmd += self.wl.extra_args + logger.info(f'Starting workload with cmd: {cmd}') if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) @@ -351,6 +361,7 @@ def __call__(self, executor, output_file): else: raise ValueError(f'Unknown executor: {executor}') cmd += self.wl.extra_args + logger.info(f'Starting workload with cmd: {cmd}') if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) @@ -405,6 +416,7 @@ def __call__(self, executor, output_file): '--num_replicas', num_replicas ] cmd += self.wl.extra_args + logger.info(f'Starting workload with cmd: {cmd}') if FLAGS.no_capture: return execute(cmd, cwd=str(cwd), env=self.env) @@ -443,6 +455,7 @@ def __call__(self, executor, output_file): '-', ] cmd += self.wl.extra_args + logger.info(f'Starting workload with cmd: {cmd}') proc = execute(cmd, cwd=str(cwd), env=self.env, stdin=sp.PIPE) proc.stdin.write(self._plan_to_bytes()) diff --git a/benchmarks/exps/smtracing.py b/benchmarks/exps/smtracing.py new file mode 100644 index 0000000..91c61ba --- /dev/null +++ b/benchmarks/exps/smtracing.py @@ -0,0 +1,175 @@ +# -*- coding: future_fstrings -*- +# +# Copyright 2019 Peifeng Yu +# +# This file is part of Salus +# (see https://github.com/SymbioticLab/Salus). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +SM Tracing: Experiment one inference job with one training job + +Almost the same as Card 304, but with proper SM tracing implemented. + +The purpose of this experiment is to tune and debug the SM tracing pipeline. + +- reduce the inference latency, and see if the tail latency for training reduces + +Record inference latency. Compare inference job latency running alone vs. running with a training job. + +The latency should be measured with increasing throughput (qps) for the inference job. + +Collected data: inference per iteration speed (latency), training throughput (derived from per iteration speed) +""" +from __future__ import absolute_import, print_function, division, unicode_literals + +import tempfile +from typing import Sequence + +from absl import flags +import logging +import os + +from benchmarks.driver.server.config import presets +from benchmarks.driver.workload import WTL, Executor +from benchmarks.driver.utils.compatiblity import pathlib +from benchmarks.exps import ( + run_seq, maybe_forced_preset, RunFn, Pause, wait_on_pipe, release_on_pipe, + case_switch_main, + run_tfdist, run_tf +) + + +FLAGS = flags.FLAGS +logger = logging.getLogger(__name__) + + +def set_env(wl): + wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true' + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = model_dir + + +def create_train(executor, idx, td): + # the batch number has no effect here, only used to distinguish different
runs + train_wl = WTL.create('inception4', 50, 100 + idx, executor=executor) + # make sure it runs long enough + train_wl.env['SALUS_ITER_SECONDS'] = '300' + + # create a pipe to signal train_wl + pipetrain = str(pathlib.Path(td).joinpath('fifotrain')) + os.mkfifo(pipetrain) + train_wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain + return train_wl, pipetrain + + +def create_infer(executor, name, bs, batch_num, td): + wl = WTL.create(name, bs, batch_num, executor=executor) + set_env(wl) + wl.env['SALUS_ITER_SECONDS'] = '150' + wl.extra_args += [ + '--eval_interval_secs=0.02', + # '--eval_interval_random_factor=5' + ] + + pipe = str(pathlib.Path(td).joinpath('fifo')) + os.mkfifo(pipe) + wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe + + return wl, pipe + + +def salus(argv): + # type: (Sequence[str]) -> None + scfg = maybe_forced_preset(presets.MostEfficient) + + name = "alexneteval" + if len(argv) > 1: + name = argv[0] + batch_sizes = [int(v) for v in argv[1:]] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + batch_num = 300 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for idx, bs in enumerate(batch_sizes): + with tempfile.TemporaryDirectory() as td: + # create a background training job + train_wl, pipetrain = create_train(Executor.Salus, idx, td) + + # create the foreground inference job + wl, pipe = create_infer(Executor.Salus, name, bs, batch_num, td) + + run_seq(scfg.copy(output_dir=FLAGS.save_dir / "salus" / (name + "-inception4")), + train_wl, # start the background job + wl, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +def 
tfmps(argv): + # type: (Sequence[str]) -> None + name = "alexneteval" + if len(argv) > 1: + name = argv[0] + batch_sizes = [int(v) for v in argv[1:]] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + batch_num = 300 + # batch_sizes = [1, 2, 4, 8, 16, 32] + # batch_sizes = [1024, 1536, 2048, 4096] + for idx, bs in enumerate(batch_sizes): + with tempfile.TemporaryDirectory() as td: + # create a background training job + train_wl, pipetrain = create_train(Executor.TF, idx, td) + train_wl.extra_args += ['--min_mem'] + + # create the foreground inference job + wl, pipe = create_infer(Executor.TF, name, bs, batch_num, td) + wl.extra_args += ['--min_mem'] + + run_tf(FLAGS.save_dir / "tfmps" / (name + "-inception4"), + train_wl, # start the background job + wl, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +@case_switch_main +def main(): + return salus, tfmps From b111dc91e2e3ab9d703b2b9fec6268d906ce4aac Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Thu, 26 Dec 2019 22:03:00 -0500 Subject: [PATCH 12/13] Fix CI --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5ab7ce7..c266979 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -5,6 +5,8 @@ stages: variables: DOCKER_HOST: tcp://docker:2375 + # This will instruct Docker not to start over TLS. 
+ DOCKER_TLS_CERTDIR: "" DOCKER_DRIVER: overlay2 IMAGE_TAG: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG LATEST_TAG: $CI_REGISTRY_IMAGE:latest From 01a36d643c2ca6c0a64b40d725895a96862c00c2 Mon Sep 17 00:00:00 2001 From: Peifeng Yu Date: Mon, 13 Jan 2020 17:51:52 -0500 Subject: [PATCH 13/13] Update code --- CMakeLists.txt | 4 + benchmarks/driver/runner.py | 7 +- benchmarks/driver/server/config/__init__.py | 4 +- benchmarks/exps/__init__.py | 7 + benchmarks/exps/smtracing.py | 27 +- benchmarks/exps/tune_pending.py | 389 +++++++++++++++++++ scripts/parse_card250.py | 17 +- scripts/parse_card260.py | 16 +- scripts/parse_card271.py | 15 +- scripts/parse_card272.py | 14 +- scripts/parse_card274.py | 19 +- scripts/parse_exp17.py | 25 +- scripts/plotutils.py | 14 +- src/config.h.in | 1 + src/main.cpp | 27 +- src/oplibraries/tensorflow/v3/smblocker.cpp | 16 +- src/oplibraries/tensorflow/v3/smblocker.h | 40 +- tests/test_tf/lib/seq2seq/ptb/ptb_word_lm.py | 11 +- tests/test_tf/test_mnist_tf.py | 6 + tests/test_tf/test_seq.py | 8 +- tests/test_tf/test_super_res.py | 1 + 21 files changed, 593 insertions(+), 75 deletions(-) create mode 100644 benchmarks/exps/tune_pending.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 9169cba..fb9af16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -148,6 +148,10 @@ if(WITH_TIMEOUT_WARNING) set(SALUS_ENABLE_TIMEOUT_WARNING 1) endif(WITH_TIMEOUT_WARNING) +if(USE_TENSORFLOW) + set(SALUS_ENABLE_TENSORFLOW 1) +endif(USE_TENSORFLOW) + configure_file(src/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h) include_directories(${CMAKE_CURRENT_BINARY_DIR}) diff --git a/benchmarks/driver/runner.py b/benchmarks/driver/runner.py index 6d51252..88fa2a3 100644 --- a/benchmarks/driver/runner.py +++ b/benchmarks/driver/runner.py @@ -213,7 +213,8 @@ def __call__(self, executor, output_file): else: output_file.parent.mkdir(exist_ok=True, parents=True) with output_file.open('w') as f: - return execute(cmd, cwd=str(cwd), env=env, stdout=f, stderr=sp.STDOUT) + # 
return execute(cmd, cwd=str(cwd), env=env, stdout=f, stderr=sp.STDOUT) + return execute(cmd, cwd=str(cwd), env=env, stdout=f, stderr=None) def _construct_test_name(self, executor): # type: (Executor) -> Tuple[str, str] @@ -239,7 +240,7 @@ def _construct_test_name(self, executor): }) } - variable_batch_size_models = {'vae', 'superres'} + variable_batch_size_models = {'vae', 'superres', 'seq2seq', 'mnistsf', 'mnistcv', 'mnistlg'} if remove_suffix(self.wl.name, 'eval') not in variable_batch_size_models: if self.wl.batch_size not in self.wl.wtl.available_batch_sizes(): raise ValueError(f"Batch size `{self.wl.batch_size}' is not supported for {self.wl.name}," @@ -273,6 +274,8 @@ def _construct_test_name(self, executor): } postfix = names.get(self.wl.batch_size, '0') + if model_name == 'seq2seq' and postfix == '0': + postfix = '2_large' method = f'{cls}.{prefix}{postfix}' return pkg, method diff --git a/benchmarks/driver/server/config/__init__.py b/benchmarks/driver/server/config/__init__.py index 2055115..dad744b 100644 --- a/benchmarks/driver/server/config/__init__.py +++ b/benchmarks/driver/server/config/__init__.py @@ -21,7 +21,7 @@ from builtins import super from absl import flags -from copy import copy +from copy import deepcopy from ...utils import maybe_path from ...utils.compatiblity import pathlib @@ -77,7 +77,7 @@ def __setattr__(self, key, value): def copy(self, **kwargs): # type: (...) -> SalusConfig """Return a new copy of the tuple""" - return copy(self).update(**kwargs) + return deepcopy(self).update(**kwargs) def update(self, d=None, **kwargs): # type: (...) 
-> SalusConfig diff --git a/benchmarks/exps/__init__.py b/benchmarks/exps/__init__.py index 9262e05..b093421 100644 --- a/benchmarks/exps/__init__.py +++ b/benchmarks/exps/__init__.py @@ -23,6 +23,8 @@ import time import re import logging +import string +import random from absl import flags from typing import Union, Iterable, List, TypeVar, Callable, Optional @@ -379,3 +381,8 @@ def release_on_pipe(pipe): def sync_on_pipe(pipe): wait_on_pipe(pipe) release_on_pipe(pipe) + + +def random_id(size=6, chars=string.ascii_uppercase + string.digits): + """Generate a random ID""" + return ''.join(random.choice(chars) for _ in range(size)) diff --git a/benchmarks/exps/smtracing.py b/benchmarks/exps/smtracing.py index 91c61ba..e1e6d19 100644 --- a/benchmarks/exps/smtracing.py +++ b/benchmarks/exps/smtracing.py @@ -170,6 +170,31 @@ def tfmps(argv): ) +def train_alone(argv): + """Run training workload alone and take note of SM usage""" + sm_factors = [float(v) for v in argv] + if not sm_factors: + sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0] + + logger.info(f"Running Salus with sm factors: {sm_factors}") + + # run salus + for factor in sm_factors: + with tempfile.TemporaryDirectory() as td: + scfg = maybe_forced_preset(presets.OpTracing) + scfg.logconf = 'smtracing' + scfg.extra_args += [ + '--sm-factor', f'{factor:.2f}' + ] + logger.info(f"Running Salus with sm factor: {factor}") + # the background training job + wl, pipe = create_train(Executor.Salus, 0, td) + run_seq(scfg.copy(output_dir=FLAGS.save_dir / "alone" / f"{factor:.2f}"), + wl, + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + RunFn(lambda *args, **kwargs: release_on_pipe(pipe))) + + @case_switch_main def main(): - return salus, tfmps + return salus, tfmps, train_alone diff --git a/benchmarks/exps/tune_pending.py b/benchmarks/exps/tune_pending.py new file mode 100644 index 0000000..7371654 --- /dev/null +++ b/benchmarks/exps/tune_pending.py @@ -0,0 +1,389 @@ +# -*- coding: future_fstrings -*- +# +#
Copyright 2019 Peifeng Yu +# +# This file is part of Salus +# (see https://github.com/SymbioticLab/Salus). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Tune pending: Experiment one inference job with one training job + +Almost the same as Card 304, but tries to tune the pending parameter. + +The purpose of this experiment is to tune and debug the SM tracing pipeline. + +- reduce the inference latency, and see if the tail latency for training reduces + +Record inference latency. Compare inference job latency running alone vs. running with a training job. + +The latency should be measured with increasing throughput (qps) for the inference job.
+ +Collected data: inference per iteration speed (latency), training throughput (derived from per iteration speed) +""" +from __future__ import absolute_import, print_function, division, unicode_literals + +import tempfile +from typing import Sequence + +from absl import flags +import logging +import os + +from benchmarks.driver.server.config import presets +from benchmarks.driver.workload import WTL, Executor +from benchmarks.driver.utils.compatiblity import pathlib +from benchmarks.exps import ( + run_seq, maybe_forced_preset, RunFn, Pause, wait_on_pipe, release_on_pipe, + case_switch_main, + run_tfdist, run_tf, + random_id, +) + + +FLAGS = flags.FLAGS +logger = logging.getLogger(__name__) + + +def set_env(wl): + wl.env['SALUS_TFBENCH_EVAL_BLOCK'] = 'true' + + model_dir = pathlib.Path('~/../symbiotic/peifeng/tf_cnn_benchmarks_models/legacy_checkpoint_models') + model_dir = model_dir.expanduser().resolve() + wl.env['SALUS_TFBENCH_EVAL_MODEL_DIR'] = model_dir + + +def create_train(executor, idx, td=None): + # the batch number has no effect here, only used to distinguish different runs + train_wl = WTL.create('inception4', 50, 100 + idx, executor=executor) + # make sure it runs long enough + train_wl.env['SALUS_ITER_SECONDS'] = '300' + + if td is not None: + # create a pipe to signal train_wl + pipetrain = str(pathlib.Path(td) / f'{train_wl.canonical_name}-{random_id()}-fifo') + os.mkfifo(pipetrain) + train_wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipetrain + return train_wl, pipetrain + return train_wl + + +def create_infer(executor, bs, td=None): + wl = WTL.create('vgg11eval', bs, 300, executor=executor) + set_env(wl) + wl.env['SALUS_ITER_SECONDS'] = '150' + wl.extra_args += [ + # '--eval_interval_secs=0.02', + # '--eval_interval_random_factor=5' + ] + + if td is not None: + pipe = str(pathlib.Path(td) / f'{wl.canonical_name}-{random_id()}-fifo') + os.mkfifo(pipe) + wl.env['SALUS_WAIT_FOR_SIGNAL'] = pipe + return wl, pipe + + return wl + + +def alone_tf(_argv): + # run 
tf + # the foreground inference job + wl = create_infer(Executor.TF, 10) + wl.extra_args += ['--min_mem'] + run_tf(FLAGS.save_dir / "alone", wl) + + # the background training job + wl = create_train(Executor.TF, 0) + wl.extra_args += ['--min_mem'] + run_tf(FLAGS.save_dir / "alone", wl) + + +def alone(argv): + """Run each workload alone for reference""" + sm_factors = [float(v) for v in argv] + if not sm_factors: + sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0] + + logger.info(f"Running Salus with sm factors: {sm_factors}") + + # run salus + for factor in sm_factors: + scfg = maybe_forced_preset(presets.MostEfficient) + scfg.extra_args += [ + '--sm-factor', f'{factor:.2f}' + ] + logger.info(f"Running Salus with sm factor: {factor}") + wl = create_infer(Executor.Salus, 10) + run_seq(scfg.copy(output_dir=FLAGS.save_dir / "alone" / f"{factor:.2f}"), wl) + + # the background training job + wl = create_train(Executor.Salus, 0) + run_seq(scfg.copy(output_dir=FLAGS.save_dir / "alone" / f"{factor:.2f}"), wl) + + +def salus(argv): + # type: (Sequence[str]) -> None + base_cfg = maybe_forced_preset(presets.MostEfficient) + + sm_factors = [float(v) for v in argv] + if not sm_factors: + sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0] + + for idx, factor in enumerate(sm_factors): + scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "salus" / f"{factor:.2f}") + scfg.extra_args += [ + '--sm-factor', f'{factor:.2f}' + ] + with tempfile.TemporaryDirectory() as td: + # create a background training job + train_wl, pipetrain = create_train(Executor.Salus, 0, td) + + # create the foreground inference job + wl, pipe = create_infer(Executor.Salus, 10, td) + + run_seq(scfg, + train_wl, # start the background job + wl, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # 
release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +def inverse_salus(argv): + # type: (Sequence[str]) -> None + """Inversed priority for training and inference""" + base_cfg = maybe_forced_preset(presets.MostEfficient) + + sm_factors = [float(v) for v in argv] + if not sm_factors: + sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0] + + for idx, factor in enumerate(sm_factors): + scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "inverse" / f"{factor:.2f}") + scfg.extra_args += [ + '--sm-factor', f'{factor:.2f}' + ] + with tempfile.TemporaryDirectory() as td: + # create a background training job + train_wl, pipetrain = create_train(Executor.Salus, 0, td) + + # create the foreground inference job + wl, pipe = create_infer(Executor.Salus, 10, td) + wl.extra_args += [ + '--eval_sched_priority', '40' + ] + + run_seq(scfg, + train_wl, # start the background job + wl, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +def same_pri_salus(argv): + # type: (Sequence[str]) -> None + """Training plus inference with the inference job at --eval_sched_priority 20 (the same-priority case)""" + base_cfg = maybe_forced_preset(presets.MostEfficient) + + sm_factors = [float(v) for v in argv] + if not sm_factors: + sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0] + + for idx, factor in enumerate(sm_factors): + scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "same_pri" / f"{factor:.2f}") + scfg.extra_args += [ + '--sm-factor', f'{factor:.2f}' + ] + with tempfile.TemporaryDirectory() as td: + # create a background training job + train_wl, pipetrain =
create_train(Executor.Salus, 0, td) + + # create the foreground inference job + wl, pipe = create_infer(Executor.Salus, 10, td) + wl.extra_args += [ + '--eval_sched_priority', '20' + ] + + run_seq(scfg, + train_wl, # start the background job + wl, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +def tfmps(argv): + # type: (Sequence[str]) -> None + batch_sizes = [int(v) for v in argv] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + for idx, bs in enumerate(batch_sizes): + with tempfile.TemporaryDirectory() as td: + # create a background training job + train_wl, pipetrain = create_train(Executor.TF, idx, td) + train_wl.extra_args += ['--min_mem'] + + # create the foreground inference job + wl, pipe = create_infer(Executor.TF, bs, td) + wl.extra_args += ['--min_mem'] + + run_tf(FLAGS.save_dir / "tfmps", + train_wl, # start the background job + wl, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipetrain)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipetrain)), + # wait 10 seconds + Pause(10), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +def twoinfer_tfmps(argv): + # type: (Sequence[str]) -> None + batch_sizes = [int(v) for v in argv] + + if not batch_sizes: + batch_sizes = [1, 2, 4, 8] + + for idx, bs in enumerate(batch_sizes): + with tempfile.TemporaryDirectory() as td: + # create the foreground inference job + wl1,
pipe1 = create_infer(Executor.TF, bs, td) + wl1.extra_args += ['--min_mem'] + # create the foreground inference job + wl2, pipe2 = create_infer(Executor.TF, bs, td) + wl2.extra_args += ['--min_mem'] + + run_tf(FLAGS.save_dir / "twoinfer" / "tfmps", + wl1, # start the background job + wl2, # start the foreground job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe1)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)), + # start train job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)), + # release inference job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +def twoinfer(argv): + # type: (Sequence[str]) -> None + base_cfg = maybe_forced_preset(presets.MostEfficient) + + sm_factors = [float(v) for v in argv] + if not sm_factors: + sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0] + + for idx, factor in enumerate(sm_factors): + scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "twoinfer" / "salus" / f"{factor:.2f}") + scfg.extra_args += [ + '--sm-factor', f'{factor:.2f}' + ] + with tempfile.TemporaryDirectory() as td: + # create the foreground inference job + wl1, pipe1 = create_infer(Executor.Salus, 10, td) + + # create the foreground inference job + wl2, pipe2 = create_infer(Executor.Salus, 10, td) + + run_seq(scfg, + wl1, # start the first job + wl2, # start the second job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe1)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)), + # start 1st job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)), + # release 2nd job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +def twoinfer_pri(argv): + # type: (Sequence[str]) -> None + """Two inferences with difference priority""" + base_cfg = maybe_forced_preset(presets.MostEfficient) + + sm_factors = [float(v) for v in argv] + if not 
sm_factors: + sm_factors = [1.0, 1.5, 2.0, 2.5, 3.0] + + for idx, factor in enumerate(sm_factors): + scfg = base_cfg.copy(output_dir=FLAGS.save_dir / "twoinfer_pri" / "salus" / f"{factor:.2f}") + scfg.extra_args += [ + '--sm-factor', f'{factor:.2f}' + ] + with tempfile.TemporaryDirectory() as td: + # create the foreground inference job + wl1, pipe1 = create_infer(Executor.Salus, 10, td) + + # create the background inference job + wl2, pipe2 = create_infer(Executor.Salus, 10, td) + wl2.extra_args += [ + '--eval_sched_priority', '20' + ] + + run_seq(scfg, + wl1, # start the first job + wl2, # start the second job + # wait for both jobs to be ready + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe1)), + RunFn(lambda *args, **kwargs: wait_on_pipe(pipe2)), + # start 1st job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe1)), + # release 2nd job + RunFn(lambda *args, **kwargs: release_on_pipe(pipe2)), + # run_seq automatically join all jobs at the end of the sequence + ) + + +@case_switch_main +def main(): + return alone, salus, tfmps, twoinfer, twoinfer_tfmps, inverse_salus, same_pri_salus, twoinfer_pri diff --git a/scripts/parse_card250.py b/scripts/parse_card250.py index 3b33080..f347c47 100644 --- a/scripts/parse_card250.py +++ b/scripts/parse_card250.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -127,12 +127,11 @@ def plot_speeds(df, total_kws=None, **kwargs): ax.legend().remove() ax.set_xlabel('Time (s)') - ax.set_ylabel('Images per second') + ax.set_ylabel('Images\nper second') return ax -path = 'logs/nsdi19' -def prepare_paper(path): +def prepare_paper(path='logs/nsdi19'): path = Path(path) df = load_speeds(path/'card250'/'case1') @@ -166,6 +165,6 @@ def prepare_paper(path): total_kws={'marker': 'None', 'zorder': -1, 'linewidth': 1}) fig.tight_layout() - fig.set_size_inches(3.25, 2.35, forward=True) - fig.savefig('/tmp/workspace/card250.pdf', dpi=300) + fig.set_size_inches(3.25, 1.5, forward=True) + fig.savefig('/tmp/workspace/card250.pdf', dpi=300, bbox_inches='tight') plt.close() diff --git a/scripts/parse_card260.py b/scripts/parse_card260.py index 317f0bd..a466f66 100644 --- a/scripts/parse_card260.py +++ b/scripts/parse_card260.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -112,14 +112,14 @@ def plot_ratio(df, **kwargs): try: path except NameError: - path = '/tmp/workspace' + path = 'logs/nsdi19' -def prepare_paper(path): +def prepare_paper(path='logs/nsdi19'): with plt.style.context(['seaborn-paper', 'mypaper', 'gray']): df = load_data(path) fig, ax = plt.subplots() - fig.set_size_inches(3.25, 1.5, forward=True) + fig.set_size_inches(3.25, 1.2, forward=True) #plot_eval_pit_vs_speed(df, ax=ax) #ax.set_xlabel('Time (s)') @@ -132,4 +132,4 @@ def prepare_paper(path): fig.tight_layout() fig.savefig('/tmp/workspace/card260.pdf', dpi=300, bbox_inches='tight', pad_inches = .015) - plt.close() \ No newline at end of file + plt.close() diff --git a/scripts/parse_card271.py b/scripts/parse_card271.py index 03999be..1764e8b 100644 --- a/scripts/parse_card271.py +++ b/scripts/parse_card271.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -129,15 +129,14 @@ def plot_makespan(df, **kwargs): return ax -path = 'logs/nsdi19' -def prepare_paper(path): +def prepare_paper(path='logs/nsdi19'): path = Path(path) with plt.style.context(['seaborn-paper', 'mypaper', 'color3']): # fifo = ju.load_trace(path/'card266'/'salus'/'trace.csv') df = load_data(path/'card271') fig, ax = plt.subplots() - fig.set_size_inches(3.25, 1.85, forward=True) + fig.set_size_inches(3.25, 1.3, forward=True) # set col order df = df[['Network', 'Salus', 'TF']] @@ -150,4 +149,4 @@ def prepare_paper(path): fig.tight_layout() fig.savefig('/tmp/workspace/card271.pdf', dpi=300) plt.close() - return df \ No newline at end of file + return df diff --git a/scripts/parse_card272.py b/scripts/parse_card272.py index 523b9e0..1ded432 100644 --- a/scripts/parse_card272.py +++ b/scripts/parse_card272.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -116,7 +116,7 @@ def do_timelines(path): path = 'logs/nsdi19' -def prepare_paper(path): +def prepare_paper(path='logs/nsdi19'): path = Path(path) with plt.style.context(['seaborn-paper', 'mypaper', 'line12']): # also use color @@ -131,7 +131,7 @@ def prepare_paper(path): pack = load_data(path/'card272'/'case1'/'salus', 'case1.output') fig, ax = plt.subplots() - fig.set_size_inches(3.25, 1.85, forward=True) + fig.set_size_inches(3.25, 1.5, forward=True) jcts = pd.DataFrame({'FIFO': fifo.JCT, 'SRTF': srtf.JCT, 'PACK': pack.JCT, 'FAIR': fair.JCT}) plot_jcts(jcts, ax=ax, markevery=0.1, markersize=4, linewidth=1) @@ -141,4 +141,4 @@ def prepare_paper(path): fig.savefig('/tmp/workspace/card272-jct.pdf', dpi=300) plt.close() - return fifo, srtf, srtf_refine, fair, pack \ No newline at end of file + return fifo, srtf, srtf_refine, fair, pack diff --git a/scripts/parse_card274.py b/scripts/parse_card274.py index fa6584c..4b2fb60 100644 --- a/scripts/parse_card274.py +++ b/scripts/parse_card274.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -35,6 +35,7 @@ #import seaborn as sns import numpy as np import matplotlib.pyplot as plt +import matplotlib.patches as mpatches from matplotlib import cycler import plotutils as pu @@ -94,11 +95,17 @@ def do_srtf2(path): plot_offset=-st_sec ) ax.set_xlim([0, ed_sec-st_sec]) + + # add a legend + ax.legend(handles=[ + mpatches.Patch(color='#b6b6b6', label='Queuing'), + mpatches.Patch(color='black', label='Active') + ], bbox_to_anchor=(0.85, 0.03), loc='lower right') ax.set_ylabel('Job #') ax.yaxis.set_ticks([0, 1, 2, 3, 4, 5]) - fig.set_size_inches(4.875, 2, forward=True) + fig.set_size_inches(4.875, 1.5, forward=True) fig.savefig('/tmp/workspace/card274-srtf-compute.pdf', dpi=300, bbox_inches='tight', pad_inches = .015) plt.close() @@ -146,7 +153,7 @@ def do_srtf3(path): ax.legend().remove() #fig.tight_layout() - fig.set_size_inches(1.625, 2, forward=True) + fig.set_size_inches(1.625, 1.5, forward=True) fig.savefig('/tmp/workspace/card274-srtf-mem.pdf', dpi=300, bbox_inches='tight', pad_inches = .015) plt.close() diff --git a/scripts/parse_exp17.py b/scripts/parse_exp17.py index e050424..1b4d4bb 100644 --- a/scripts/parse_exp17.py +++ b/scripts/parse_exp17.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -73,20 +73,25 @@ def load_exp17(path): pits = pits.drop(['index', 'BatchSize', 'Network'], axis=1) pits = pits.rename(columns={'Model': 'Network'}).set_index('Network') +old_vae = pits.at['vae', 'Salus'] +pits.loc['vae', 'Salus'] = 1.2 +old_superres = pits.at['superres', 'Salus'] +pits.loc['superres', 'Salus'] = 1.2 + with plt.style.context(['seaborn-paper', 'mypaper', 'color3']): ax = pits.plot.bar(legend=None) pu.axhlines(1.0, ax=ax, color='k', linestyle='--', linewidth=1) - pu.bar_show_data(ax, pits.index.get_loc('superres'), pits.at['superres', 'Salus']) - pu.bar_show_data(ax, pits.index.get_loc('vae'), pits.at['vae', 'Salus']) + pu.bar_show_data(ax, pits.index.get_loc('superres'), 1.15, data_y=old_superres, fmt='{:.2f}') + pu.bar_show_data(ax, pits.index.get_loc('vae'), 1.13, data_y=old_vae, fmt='{:.2f}') - ax.set_ylim(0.9, 1.9) + ax.set_ylim(0.9, 1.15) ax.set_xlabel('Workloads') - ax.set_ylabel('Normalized Per Iteration\nTraining Time') + ax.set_ylabel('Normalized\nPer Iteration\nTraining Time') # ax.legend() ax.tick_params(axis='x', labelsize=7) - ax.figure.set_size_inches(3.25, 2.35, forward=True) + ax.figure.set_size_inches(3.25, 1.8, forward=True) ax.figure.tight_layout() - ax.figure.savefig('/tmp/workspace/exp17.pdf', dpi=300, bbox_inches='tight', pad_inches = .015) + ax.figure.savefig('/tmp/workspace/exp17.pdf', dpi=300, bbox_inches='tight', pad_inches=.015) plt.close() diff --git a/scripts/plotutils.py b/scripts/plotutils.py index cb4c45c..41da8e8 100644 --- a/scripts/plotutils.py +++ b/scripts/plotutils.py @@ -1,15 +1,15 @@ # # Copyright 2019 Peifeng Yu -# +# # This file is part of Salus # (see https://github.com/SymbioticLab/Salus). -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -435,7 +435,7 @@ def bar(df, width=0.8, ax=None, **kwargs): return ax -def bar_show_data(ax, x, y, fmt='{:.1f}', **kwargs): +def bar_show_data(ax, x, y, data_y=None, fmt='{:.1f}', **kwargs): kws = { 'xytext': [0, 7], 'textcoords': 'offset points', @@ -443,7 +443,9 @@ def bar_show_data(ax, x, y, fmt='{:.1f}', **kwargs): 'horizontalalignment': 'center', 'verticalalignment': 'top' } - ax.annotate(fmt.format(y), + if data_y is None: + data_y = y + ax.annotate(fmt.format(data_y), xy=[x, y], **{**kws, **kwargs}) diff --git a/src/config.h.in b/src/config.h.in index 16f5518..60bca31 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -12,6 +12,7 @@ #cmakedefine SALUS_ENABLE_EXCLUSIVE_ITER #cmakedefine SALUS_ENABLE_TIMEOUT_WARNING #cmakedefine SALUS_ENABLE_JSON_LOG +#cmakedefine SALUS_ENABLE_TENSORFLOW #define SALUS_BUILD_TYPE "@CMAKE_BUILD_TYPE@" diff --git a/src/main.cpp b/src/main.cpp index 03c966e..ab67d14 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -17,13 +17,18 @@ * limitations under the License. 
*/ +#include "utils/macros.h" + +#ifdef SALUS_ENABLE_TENSORFLOW +#include "oplibraries/tensorflow/v3/smblocker.h" +#endif + #include "execution/executionengine.h" #include "resources/resources.h" #include "platform/logging.h" #include "platform/signals.h" #include "platform/profiler.h" #include "rpcserver/zmqserver.h" -#include "utils/macros.h" #include "utils/envutils.h" #include @@ -45,6 +50,7 @@ const static auto listen = "--listen"; const static auto maxHolWaiting = "--max-hol-waiting"; const static auto disableFairness = "--disable-fairness"; const static auto disableWorkConservative = "--disable-wc"; +const static auto smFactor = "--sm-factor"; const static auto scheduler = "--sched"; const static auto logConf = "--logconf"; @@ -77,6 +83,7 @@ Salus: Fine-Grained GPU Sharing for DNN. fairness is on. --max-hol-waiting= Maximum number of task allowed go before queue head in scheduling. [default: 50] + --sm-factor= Scale factor for # of SMs. [default: 1] -c , --logconf= Path to log configuration file. Note that settings in this file takes precedence over other command line arguments. @@ -214,6 +221,17 @@ void configureExecution(std::map &args) salus::ExecutionEngine::instance().setSchedulingParam({maxQueueHeadWaiting, !disableWorkConservative, sched}); } +void configureSMBlocker(std::map &args) +{ +#ifdef SALUS_ENABLE_TENSORFLOW + // docopt doesn't handle double number + // so we get as string and do conversion ourselves + auto scale = std::atof(value_or(args[flags::smFactor], "1.0"s).c_str()); + + salus::oplib::tensorflow::SMBlocker::setScaleFactorSM(scale); +#endif +} + void printConfiguration(std::map &) { LOG(INFO) << "Running build type: " << SALUS_BUILD_TYPE; @@ -237,6 +255,11 @@ void printConfiguration(std::map &) LOG(INFO) << " Policy: " << param.scheduler; LOG(INFO) << " MaxQueueHeadWaiting: " << param.maxHolWaiting; LOG(INFO) << " WorkConservative: " << (param.workConservative ? 
"on" : "off"); + +#ifdef SALUS_ENABLE_TENSORFLOW + LOG(INFO) << "GPU execution:"; + LOG(INFO) << " SM scale factor: " << salus::oplib::tensorflow::SMBlocker::scaleFactorSM(); +#endif } int main(int argc, char **argv) @@ -250,6 +273,8 @@ int main(int argc, char **argv) configureExecution(args); + configureSMBlocker(args); + printConfiguration(args); ScopedProfiling sp(value_or(args[flags::gperf], false)); diff --git a/src/oplibraries/tensorflow/v3/smblocker.cpp b/src/oplibraries/tensorflow/v3/smblocker.cpp index 7114701..f2eaf55 100644 --- a/src/oplibraries/tensorflow/v3/smblocker.cpp +++ b/src/oplibraries/tensorflow/v3/smblocker.cpp @@ -78,9 +78,11 @@ void salus_kernel_launch_callback(unsigned int gridDimX, unsigned int gridDimY, namespace salus::oplib::tensorflow { +double SMBlocker::m_scaleFactorSM = 0.0; + SMBlocker &SMBlocker::instance() { - static SMBlocker blocker; + static SMBlocker blocker(scaleFactorSM()); return blocker; } @@ -95,9 +97,9 @@ SMUsage SMBlocker::queryAvailableSM() }; } -SMBlocker::SMBlocker() - : m_maxUsage{queryAvailableSM()} - , m_freeBlocks(m_maxUsage.blockCount) +SMBlocker::SMBlocker(double factor) + : m_maxUsage{queryAvailableSM(), factor} + , m_freeBlocks(m_maxUsage.get().blockCount) { } @@ -159,7 +161,7 @@ void SMBlocker::wait(uint64_t graphId, int nodeId, int priority) LogSMTracing() << "Wait at SMBlocker: graph " << graphId << " node " << nodeId << " sm " << smUsage << " priority " << priority; m_freeBlocks.wait(smUsage, priority); - LogSMTracing() << "Passed at SMBlocker: graph " << graphId << " node " << nodeId + LogSMTracing() << "Took at SMBlocker: graph " << graphId << " node " << nodeId << " sm " << smUsage << " priority " << priority; } @@ -169,11 +171,13 @@ uint64_t SMBlocker::getUsageForKernel(uint64_t graphId, int nodeId) auto usage = sstl::getOrDefault(m_cache, {graphId, nodeId}, {}); - return std::min(usage.blockCount, m_maxUsage.blockCount); + return std::min(usage.blockCount, m_maxUsage.get().blockCount); } void 
SMBlocker::release(uint64_t numSms) { + LogSMTracing() << "Release at SMBlocker: graph " << 0 << " node " << 0 + << " sm " << numSms << " priority " << 0; m_freeBlocks.post(numSms); } diff --git a/src/oplibraries/tensorflow/v3/smblocker.h b/src/oplibraries/tensorflow/v3/smblocker.h index 89a54fb..714a871 100644 --- a/src/oplibraries/tensorflow/v3/smblocker.h +++ b/src/oplibraries/tensorflow/v3/smblocker.h @@ -57,6 +57,17 @@ class SMBlocker public: static SMBlocker &instance(); + static void setScaleFactorSM(double factor) + { + m_scaleFactorSM = factor; + } + + static double scaleFactorSM() + { + CHECK_NE(m_scaleFactorSM, 0.0) << "Must call SMBlocker::setScaleFactorSM before getting value"; + return m_scaleFactorSM; + } + /** * @brief Release this amount of numSms */ @@ -95,13 +106,38 @@ class SMBlocker static constexpr int MaxPriority = 100; private: + static double m_scaleFactorSM; static SMUsage queryAvailableSM(); - SMBlocker(); + explicit SMBlocker(double factor); uint64_t getUsageForKernel(uint64_t graphId, int nodeId); - const SMUsage m_maxUsage; + class MaxSMUsage + { + SMUsage usage; + double scale; + public: + explicit MaxSMUsage(SMUsage u, double scale = 1.0) + : usage(u) + , scale(scale) + {} + + SMUsage get() const { + return {usage.threadPerBlock, static_cast(usage.blockCount * scale)}; + } + double getScale() const { + return scale; + } + void set(SMUsage u) { + usage = u; + } + void setScale(double s) { + scale = s; + } + }; + + MaxSMUsage m_maxUsage; sstl::priority_semaphore m_freeBlocks; diff --git a/tests/test_tf/lib/seq2seq/ptb/ptb_word_lm.py b/tests/test_tf/lib/seq2seq/ptb/ptb_word_lm.py index bfe1ed6..6f9eb46 100644 --- a/tests/test_tf/lib/seq2seq/ptb/ptb_word_lm.py +++ b/tests/test_tf/lib/seq2seq/ptb/ptb_word_lm.py @@ -206,7 +206,7 @@ def run_epoch(self, session, eval_op=None, verbose=False): state = session.run(self.initial_state) eval_interval = os.environ.get('SALUS_TFBENCH_EVAL_INTERVAL', '0.1') - eval_rand_factor = 
os.environ.get('SALUS_TFBENCH_EVAL_RAND_FACTOR', '5') + eval_rand_factor = os.environ.get('SALUS_TFBENCH_EVAL_RAND_FACTOR', None) eval_block = os.environ.get('SALUS_TFBENCH_EVAL_BLOCK', 'true') if eval_block != 'true': @@ -242,10 +242,11 @@ def run_epoch(self, session, eval_op=None, verbose=False): print(fmt_str.format(datetime.now(), step, np.exp(costs / iters), local_speed, dur)) if self._train_op is None: - factor = 1 - if eval_rand_factor != "1": - factor = random.randint(1, int(eval_rand_factor)) - time.sleep(float(eval_interval) * factor) + if float(eval_interval) > 0: + factor = 1 + if eval_rand_factor is not None: + factor = random.randint(1, int(eval_rand_factor)) + time.sleep(float(eval_interval) * factor) return np.exp(costs / iters), speeds diff --git a/tests/test_tf/test_mnist_tf.py b/tests/test_tf/test_mnist_tf.py index bcdd15b..21b5496 100644 --- a/tests/test_tf/test_mnist_tf.py +++ b/tests/test_tf/test_mnist_tf.py @@ -16,6 +16,8 @@ def run_mnist_softmax(sess, batch_size=50): + batch_size = tfhelper.batch_size_from_env(batch_size) + print('Using batch_size {}'.format(batch_size)) x_image, y_, num_classes = fake_data(batch_size, None, height=28, width=28, depth=1, num_classes=10) y_ = tf.one_hot(y_, num_classes) x = tf.reshape(x_image, [-1, 784]) @@ -73,6 +75,8 @@ def conv2d(x, W): def max_pool_2x2(x): return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') + batch_size = tfhelper.batch_size_from_env(batch_size) + print('Using batch_size {}'.format(batch_size)) x_image, y_, num_classes = fake_data(batch_size, None, height=28, width=28, depth=1, num_classes=10) y_ = tf.one_hot(y_, num_classes) keep_prob = tf.placeholder(tf.float32) @@ -150,6 +154,8 @@ def conv2d(x, W): def max_pool_2x2(x): return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') + batch_size = tfhelper.batch_size_from_env(batch_size) + print('Using batch_size {}'.format(batch_size)) x_image, y_, num_classes = fake_data(batch_size, 
None, height=28, width=28, depth=1, num_classes=10) y_ = tf.one_hot(y_, num_classes) keep_prob = tf.placeholder(tf.float32) diff --git a/tests/test_tf/test_seq.py b/tests/test_tf/test_seq.py index 64edbbb..971d558 100644 --- a/tests/test_tf/test_seq.py +++ b/tests/test_tf/test_seq.py @@ -19,10 +19,12 @@ def run_seq_ptb(sess, config_name): eval_config = get_config(config_name) config.max_max_epoch = 1 config.max_epoch = 1 + config.batch_size = tfhelper.batch_size_from_env(config.batch_size) + print("Using batch size {}".format(config.batch_size)) eval_config.max_max_epoch = 1 eval_config.max_epoch = 1 - eval_config.batch_size = 1 + eval_config.batch_size = config.batch_size eval_config.num_steps = 1 train_input, valid_input, test_input = datasets.ptb_data(config, eval_config) @@ -65,10 +67,12 @@ def test_seq_ptb(sess, config_name): eval_config = get_config(config_name) config.max_max_epoch = 1 config.max_epoch = 1 + config.batch_size = tfhelper.batch_size_from_env(config.batch_size) + print("Using batch size {}".format(config.batch_size)) eval_config.max_max_epoch = 1 eval_config.max_epoch = 1 - eval_config.batch_size = 1 + eval_config.batch_size = config.batch_size eval_config.num_steps = 1 train_input, valid_input, test_input = datasets.ptb_data(config, eval_config) diff --git a/tests/test_tf/test_super_res.py b/tests/test_tf/test_super_res.py index fff5fa2..7dfd669 100644 --- a/tests/test_tf/test_super_res.py +++ b/tests/test_tf/test_super_res.py @@ -20,6 +20,7 @@ def run_superres(sess, input_data, batch_size=100, isEval=False): batch_size = tfhelper.batch_size_from_env(batch_size) + print("{}: Using batch size {}".format(datetime.now(), batch_size)) input_images, target_images = input_data(batch_size=batch_size)