Skip to content

Commit

Permalink
Update: write and read the model, train and predict respectively
Browse files Browse the repository at this point in the history
  • Loading branch information
KexingLi22 committed Jul 22, 2024
1 parent 9001604 commit cf08084
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 1 deletion.
21 changes: 20 additions & 1 deletion src/test/java/org/apache/sysds/test/gpu/multigpu/GPUTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ public abstract class GPUTest extends AutomatedTestBase {
// Directory for this test class's files, derived from TEST_DIR and MultiGPUTest's simple class name.
protected static final String TEST_CLASS_DIR = TEST_DIR + MultiGPUTest.class.getSimpleName() + "/";
// Test configuration names; runTrainingScript picks one of them depending on its multiGPUs flag.
protected static final String SINGLE_GPU_TEST = "SingleGPUTest";
protected static final String MULTI_GPUS_TEST = "MultiGPUsTest";
// NOTE(review): TEST_NAME is declared on two consecutive lines here. This looks like the removed and
// the added line of the diff shown together ("GPUTest" replaced by "InferenceScript") - confirm that
// only the second declaration exists in the actual file.
protected static final String TEST_NAME = "GPUTest";
protected static final String TEST_NAME = "InferenceScript";
// Base name of the training DML script; runTrainingScript appends ".dml" to it.
protected static final String TRAIN_SCRIPT = "TrainScript";
// CSV data set that runTrainingScript passes to the script as its first positional argument.
protected static final String DATA_SET = DATASET_DIR + "MNIST/mnist_test.csv";
// Configuration files passed after "-config" for single-GPU and multi-GPU runs, respectively.
protected static final String SINGLE_TEST_CONFIG = CONFIG_DIR + "SystemDS-SingleGPU-config.xml";
protected static final String MULTI_TEST_CONFIG = CONFIG_DIR + "SystemDS-config.xml";
Expand Down Expand Up @@ -90,6 +91,24 @@ protected void runMultiGPUsTest(boolean multiGPUs, int numTestImages) {
appender.clearLogMessages();
}

/**
 * Runs the training DML script ({@code TRAIN_SCRIPT}) under the selected GPU configuration.
 *
 * <p>The script receives three positional arguments: the data set path ({@code DATA_SET}),
 * the output location {@code output("R")} and {@code numTestImages} as a string.
 *
 * @param multiGPUs     if {@code true}, the multi-GPU test configuration and config file are used;
 *                      otherwise the single-GPU ones
 * @param numTestImages number of test images, forwarded to the script as its third argument
 */
protected void runTrainingScript(boolean multiGPUs, int numTestImages) {
	getAndLoadTestConfiguration(multiGPUs ? MULTI_GPUS_TEST : SINGLE_GPU_TEST);

	String scriptHome = SCRIPT_DIR + TEST_DIR;
	fullDMLScriptName = scriptHome + TRAIN_SCRIPT + ".dml";
	programArgs = new String[] { "-args", DATA_SET, output("R"), Integer.toString(numTestImages), "-config",
		multiGPUs ? MULTI_TEST_CONFIG : SINGLE_TEST_CONFIG };
	// NOTE(review): this still points at the TEST_NAME (inference) R script rather than a training one;
	// presumably irrelevant because rCmd is null below - confirm.
	fullRScriptName = scriptHome + TEST_NAME + ".R";

	rCmd = null;
	// Called only for its logging setup; the returned appender is not read in this method,
	// so it is no longer stored in an unused local.
	configureLog4j();

	runTest(true, false, null, -1);
}

protected static InMemoryAppender configureLog4j() {
Logger rootLogger = Logger.getRootLogger();
rootLogger.setLevel(Level.ERROR);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ public void test01_gpuTest_500k() {

@Override
protected void runMultiGPUsTest(boolean multiGPUs, int numTestImages) {
// Train the model first
super.runTrainingScript(multiGPUs, numTestImages);

long startTime = System.nanoTime();
super.runMultiGPUsTest(multiGPUs, numTestImages);
long endTime = System.nanoTime();
Expand Down
73 changes: 73 additions & 0 deletions src/test/scripts/gpu/InferenceScript.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Inference benchmark: loads a previously saved model and times batched
# prediction over randomly generated images inside a parfor loop.
# Arguments used: $1 = path to a CSV data set, $3 = number of test images.
source("src/test/scripts/applications/nn/component/efficientNet.dml") as eff

# Read the CSV data set given as $1
data = read($1, format="csv")

N = nrow(data)

# First column holds the labels, the remaining columns hold the image values
images = data[,2:ncol(data)]
labels = data[,1]

# Scale images by 1/255 and one-hot encode the labels into an N x 10 matrix
# (labels+1 turns the label value into a 1-based column index)
# NOTE(review): N, images and labels are not referenced again in this script;
# prediction below runs on the random test data only.
images = images / 255.0
labels = table(seq(1, N), labels+1, N, 10)

# Load the trained model from the fixed path that TrainScript.dml writes to
model = read("output/model.txt", format="text")

# Generate random data for predicting: num_test_images rows of 28*28 values in [0,1]
# and one label per row drawn from [0,9]; both use the fixed seed 42
num_test_images = as.integer($3)
test_images = rand(rows=num_test_images, cols=28*28, min=0, max=1, sparsity=1.0, pdf="uniform", seed=42)
test_labels = rand(rows=num_test_images, cols=1, min=0, max=9, sparsity=1.0, pdf="uniform", seed=42)
test_labels = round(test_labels)

# One-hot encode the test labels (values 0..9 map to columns 1..10)
test_labels = table(seq(1, num_test_images), test_labels+1, num_test_images, 10)

# Materialize intermediates by computing and printing their sums,
# presumably so that data generation is not counted in the timed region below
print("Sum of test_images: " + sum(test_images))
print("Sum of test_labels: " + sum(test_labels))

# Also Predict in Batches since otherwise we can run into Memory Issues
# Could be unnecessary on more powerful machines :)
batch_size = 1024 # rows per prediction batch
iters = ceil(num_test_images / batch_size)
partial_accuracies = matrix(0, rows=iters, cols=1)

# Start timing the parfor loop
parfor_start_time = time()
parfor(i in 1:iters) {
# Row range of batch i; the last batch is clipped to num_test_images.
# NOTE(review): (i-1)*batch_size stays below num_test_images for every i in 1:iters,
# so the modulo appears to be a no-op - confirm before removing.
beg = ((i-1) * batch_size) %% num_test_images + 1
end = min(num_test_images, beg + batch_size - 1)
X_batch = test_images[beg:end,]
y_batch = test_labels[beg:end,]

# Predict the batch; the arguments 1, 28, 28 are presumably channels, height and width - TODO confirm
pred = eff::netPredict(X_batch, model, 1, 28, 28)
# Fraction of rows whose arg-max prediction equals the arg-max of the one-hot label
# NOTE(review): partial_accuracies is not read after the loop in this script
partial_accuracies[i,1] = mean(rowIndexMax(pred) == rowIndexMax(y_batch))
}
# End timing the parfor loop
parfor_end_time = time()
# Elapsed time divided by 1e9 and rounded down; whole seconds if time() reports nanoseconds - TODO confirm
parfor_execution_time = floor((parfor_end_time-parfor_start_time)/1000000000)
print("Parfor Execution Time: " + parfor_execution_time)
45 changes: 45 additions & 0 deletions src/test/scripts/gpu/TrainScript.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Training script: trains the network from efficientNet.dml on the CSV data set
# given as $1 and stores the resulting model at output/model.txt, the path
# that InferenceScript.dml reads the model from.
source("src/test/scripts/applications/nn/component/efficientNet.dml") as eff

# Read training data
data = read($1, format="csv")

N = nrow(data)

# First column holds the labels, the remaining columns hold the image values
images = data[,2:ncol(data)]
labels = data[,1]

# Scale images by 1/255, and one-hot encode the labels into an N x 10 matrix
# (labels+1 turns the label value into a 1-based column index)
images = images / 255.0
labels = table(seq(1, N), labels+1, N, 10)

# Initialize and train the model for a single epoch with batches of 256 rows.
# The arguments 1, 28, 28 are presumably channels, height and width, and
# 0.025 / 0.9 presumably learning rate and momentum - TODO confirm against efficientNet.dml
model = eff::initNetwork(1, 10, -1)
epochs = 1
batch_size = 256
model = eff::netTrain(model, images, 1, 28, 28, labels, epochs, batch_size, 0.025, 0.9, TRUE)

# Save the trained model to disk
write(model, "output/model.txt", format="text")
# Report the path actually written above (the message previously named output/model.bin)
print("Trained model saved to output/model.txt")

0 comments on commit cf08084

Please sign in to comment.