From cf080840d0d01dacd076d6089c44fcac22942ef7 Mon Sep 17 00:00:00 2001 From: KexingLi22 Date: Mon, 22 Jul 2024 14:24:57 +0200 Subject: [PATCH] Update: write and read the model, train and predict respectively --- .../sysds/test/gpu/multigpu/GPUTest.java | 21 +++++- .../test/gpu/multigpu/SingleGPUTest.java | 3 + src/test/scripts/gpu/InferenceScript.dml | 73 +++++++++++++++++++ src/test/scripts/gpu/TrainScript.dml | 45 ++++++++++++ 4 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 src/test/scripts/gpu/InferenceScript.dml create mode 100644 src/test/scripts/gpu/TrainScript.dml diff --git a/src/test/java/org/apache/sysds/test/gpu/multigpu/GPUTest.java b/src/test/java/org/apache/sysds/test/gpu/multigpu/GPUTest.java index 89c1558d860..32f4234ed87 100644 --- a/src/test/java/org/apache/sysds/test/gpu/multigpu/GPUTest.java +++ b/src/test/java/org/apache/sysds/test/gpu/multigpu/GPUTest.java @@ -40,7 +40,8 @@ public abstract class GPUTest extends AutomatedTestBase { protected static final String TEST_CLASS_DIR = TEST_DIR + MultiGPUTest.class.getSimpleName() + "/"; protected static final String SINGLE_GPU_TEST = "SingleGPUTest"; protected static final String MULTI_GPUS_TEST = "MultiGPUsTest"; - protected static final String TEST_NAME = "GPUTest"; + protected static final String TEST_NAME = "InferenceScript"; + protected static final String TRAIN_SCRIPT = "TrainScript"; protected static final String DATA_SET = DATASET_DIR + "MNIST/mnist_test.csv"; protected static final String SINGLE_TEST_CONFIG = CONFIG_DIR + "SystemDS-SingleGPU-config.xml"; protected static final String MULTI_TEST_CONFIG = CONFIG_DIR + "SystemDS-config.xml"; @@ -90,6 +91,24 @@ protected void runMultiGPUsTest(boolean multiGPUs, int numTestImages) { appender.clearLogMessages(); } + /** + * Run the training script + */ + protected void runTrainingScript(boolean multiGPUs, int numTestImages) { + getAndLoadTestConfiguration(multiGPUs ? 
MULTI_GPUS_TEST : SINGLE_GPU_TEST); + + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TRAIN_SCRIPT + ".dml"; + programArgs = new String[] { "-args", DATA_SET, output("R"), Integer.toString(numTestImages), "-config", + multiGPUs ? MULTI_TEST_CONFIG : SINGLE_TEST_CONFIG }; + fullRScriptName = HOME + TEST_NAME + ".R"; + + rCmd = null; + InMemoryAppender appender = configureLog4j(); + + runTest(true, false, null, -1); + } + protected static InMemoryAppender configureLog4j() { Logger rootLogger = Logger.getRootLogger(); rootLogger.setLevel(Level.ERROR); diff --git a/src/test/java/org/apache/sysds/test/gpu/multigpu/SingleGPUTest.java b/src/test/java/org/apache/sysds/test/gpu/multigpu/SingleGPUTest.java index 9fc5fedbbe6..51a45737aba 100644 --- a/src/test/java/org/apache/sysds/test/gpu/multigpu/SingleGPUTest.java +++ b/src/test/java/org/apache/sysds/test/gpu/multigpu/SingleGPUTest.java @@ -64,6 +64,9 @@ public void test01_gpuTest_500k() { @Override protected void runMultiGPUsTest(boolean multiGPUs, int numTestImages) { + // Train the model first; this happens before startTime is captured, so training is outside the startTime/endTime interval + super.runTrainingScript(multiGPUs, numTestImages); + long startTime = System.nanoTime(); super.runMultiGPUsTest(multiGPUs, numTestImages); long endTime = System.nanoTime(); diff --git a/src/test/scripts/gpu/InferenceScript.dml b/src/test/scripts/gpu/InferenceScript.dml new file mode 100644 index 00000000000..5321653b754 --- /dev/null +++ b/src/test/scripts/gpu/InferenceScript.dml @@ -0,0 +1,73 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("src/test/scripts/applications/nn/component/efficientNet.dml") as eff + +# Read training data (CSV file given as script argument $1) +data = read($1, format="csv") + +N = nrow(data) + +# Extract images (columns 2..ncol) and labels (column 1) +images = data[,2:ncol(data)] +labels = data[,1] + +# Scale images to [0,1], and one-hot encode the labels (NOTE(review): data, images and labels are not referenced again below; prediction uses the generated test_images/test_labels. Confirm this load is still needed) +images = images / 255.0 +labels = table(seq(1, N), labels+1, N, 10) + +# Load the trained model from the fixed relative path output/model.txt (the path TrainScript.dml writes to) +model = read("output/model.txt", format="text") + +# Generate random data for predicting; the number of test images comes from script argument $3 +num_test_images = as.integer($3) +test_images = rand(rows=num_test_images, cols=28*28, min=0, max=1, sparsity=1.0, pdf="uniform", seed=42) +test_labels = rand(rows=num_test_images, cols=1, min=0, max=9, sparsity=1.0, pdf="uniform", seed=42) +test_labels = round(test_labels) + +# One-hot encode the test labels (rounded values 0..9 are shifted to columns 1..10) +test_labels = table(seq(1, num_test_images), test_labels+1, num_test_images, 10) + +# Materialize intermediates by computing and printing their sums +print("Sum of test_images: " + sum(test_images)) +print("Sum of test_labels: " + sum(test_labels)) + +# Also Predict in Batches since otherwise we can run into Memory Issues +# Could be unnecessary on more powerful machines :) +batch_size = 1024 # Adjust the batch size to a larger value +iters = ceil(num_test_images / batch_size) +partial_accuracies = matrix(0, rows=iters, cols=1) + +# Start timing the parfor loop +parfor_start_time = time() +parfor(i in 1:iters) { + beg = ((i-1) * batch_size) %% num_test_images + 1 + end = min(num_test_images, beg + batch_size - 1) + 
X_batch = test_images[beg:end,] + y_batch = test_labels[beg:end,] + + pred = eff::netPredict(X_batch, model, 1, 28, 28) + partial_accuracies[i,1] = mean(rowIndexMax(pred) == rowIndexMax(y_batch)) +} +# End timing the parfor loop (assumes time() is in nanoseconds, hence the division to whole seconds; TODO confirm. NOTE(review): partial_accuracies is filled above but never printed or written) +parfor_end_time = time() +parfor_execution_time = floor((parfor_end_time-parfor_start_time)/1000000000) +print("Parfor Execution Time: " + parfor_execution_time) diff --git a/src/test/scripts/gpu/TrainScript.dml b/src/test/scripts/gpu/TrainScript.dml new file mode 100644 index 00000000000..c0711398ab7 --- /dev/null +++ b/src/test/scripts/gpu/TrainScript.dml @@ -0,0 +1,45 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("src/test/scripts/applications/nn/component/efficientNet.dml") as eff + +# Read training data (CSV file given as script argument $1) +data = read($1, format="csv") + +N = nrow(data) + +# Extract images (columns 2..ncol) and labels (column 1) +images = data[,2:ncol(data)] +labels = data[,1] + +# Scale images to [0,1], and one-hot encode the labels +images = images / 255.0 +labels = table(seq(1, N), labels+1, N, 10) + +# Initialize and train the model (one epoch, mini-batches of 256 rows) +model = eff::initNetwork(1, 10, -1) +epochs = 1 +batch_size = 256 +model = eff::netTrain(model, images, 1, 28, 28, labels, epochs, batch_size, 0.025, 0.9, TRUE) + +# Save the trained model to disk (fixed relative path; InferenceScript.dml reads the same file) +write(model, "output/model.txt", format="text") +print("Trained model saved to output/model.txt") \ No newline at end of file