On PR #1999
GitHub Actions / TT-Forge-FE Tests
failed
Jan 13, 2025 in 0s
689 tests run, 423 passed, 263 skipped, 3 failed.
Annotations
Check failure on line 1 in forge/test/mlir/llama/tests/test_specific_ops_llama32.py
github-actions / TT-Forge-FE Tests
test_specific_ops_llama32.test_matmul[shapes6]
[XPASS(strict)] pcc ~ 0.65
Raw output
[XPASS(strict)] pcc ~ 0.65
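For readers unfamiliar with the [XPASS(strict)] marker: it means a test annotated as an expected failure passed instead, and strict mode promotes that unexpected pass to a CI failure. A minimal sketch of the pattern (the reason string mirrors this annotation; compute_pcc and the 0.99 threshold are hypothetical stand-ins, not the actual test code):

```python
import pytest

def compute_pcc():
    # Hypothetical stand-in for the real device-vs-golden comparison.
    return 0.995

# strict=True makes pytest report an unexpected pass as [XPASS(strict)]
# and fail the run, so stale xfail markers surface as CI failures.
@pytest.mark.xfail(strict=True, reason="pcc ~ 0.65")
def test_matmul():
    assert compute_pcc() >= 0.99  # now passes -> [XPASS(strict)] failure
```

The usual fix is to remove or relax the xfail marker once the op produces an acceptable PCC.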
Check failure on line 149 in forge/test/mlir/mnist/training/test_training.py
github-actions / TT-Forge-FE Tests
test_training.test_mnist_training_with_grad_accumulation
assert False
+ where False = <built-in method allclose of type object at 0x7f8730096480>(tensor(0.54975, grad_fn=<DivBackward1>), tensor(0.52199, grad_fn=<DivBackward1>), rtol=0.05)
+ where <built-in method allclose of type object at 0x7f8730096480> = torch.allclose
Raw output
@pytest.mark.push
def test_mnist_training_with_grad_accumulation():
    torch.manual_seed(0)

    # Config
    num_epochs = 3
    batch_size = 1
    learning_rate = 0.001

    # Limit number of batches to run - quicker test
    limit_num_batches = 1000

    # Load dataset
    test_loader, train_loader = load_dataset(batch_size)

    # Define model and instruct it to compile and run on TT device
    framework_model = MNISTLinear(bias=False)  # bias=False because batch_size=1 with bias=True is not supported

    # Create a torch loss and leave on CPU
    loss_fn = torch.nn.CrossEntropyLoss()

    # Define optimizer and instruct it to compile and run on TT device
    framework_optimizer = torch.optim.SGD(framework_model.parameters(), lr=learning_rate)
    tt_model = forge.compile(framework_model, sample_inputs=[torch.rand(batch_size, 784)], training=True)

    logger.info("Starting training loop... (logger will be disabled)")
    logger.disable("")

    for epoch_idx in range(num_epochs):
        # Reset gradients (every epoch) - since our batch size is currently 1,
        # we accumulate gradients across multiple batches (limit_num_batches),
        # and then run the optimizer.
        framework_optimizer.zero_grad()

        total_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            # Create target tensor and leave on CPU
            target = nn.functional.one_hot(target, num_classes=10).float()

            # Forward pass (prediction) on device
            pred = tt_model(data)[0]
            golden_pred = framework_model(data)
            assert compare_with_golden(golden_pred, pred, pcc=0.95)

            # Compute loss on CPU
            loss = loss_fn(pred, target)
            total_loss += loss.item()

            golden_loss = loss_fn(golden_pred, target)
>           assert torch.allclose(loss, golden_loss, rtol=5e-2)  # 5% tolerance
E           assert False
E           +  where False = <built-in method allclose of type object at 0x7f8730096480>(tensor(0.54975, grad_fn=<DivBackward1>), tensor(0.52199, grad_fn=<DivBackward1>), rtol=0.05)
E           +    where <built-in method allclose of type object at 0x7f8730096480> = torch.allclose

forge/test/mlir/mnist/training/test_training.py:149: AssertionError
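Why this trips the 5% tolerance: torch.allclose(a, b, rtol=r) checks |a - b| <= atol + r * |b|, with atol defaulting to 1e-8. Plugging in the values from the traceback (a quick check using only the documented semantics of torch.allclose):

```python
import torch

loss = torch.tensor(0.54975)         # device loss from the traceback
golden_loss = torch.tensor(0.52199)  # CPU golden loss

# torch.allclose checks |a - b| <= atol + rtol * |b| (atol defaults to 1e-8).
diff = abs(loss - golden_loss)           # ~0.02776
bound = 1e-8 + 5e-2 * abs(golden_loss)   # ~0.02610
print(diff.item(), bound.item(), torch.allclose(loss, golden_loss, rtol=5e-2))
# ~0.02776 > ~0.02610 -> False
```

The device loss is roughly 5.3% above the golden loss, just outside the 5% band, so this is a near-miss rather than a gross mismatch.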
Check failure on line 1 in forge/test/mlir/resnet/test_resnet_unique_ops.py
github-actions / TT-Forge-FE Tests
test_resnet_unique_ops.test_matmul_resnet[1-1000-2048]
[XPASS(strict)] Tensor mismatch. PCC = 0.9425581505871167, but required = 0.99. Tracking on: https://github.com/tenstorrent/tt-mlir/issues/1576
Raw output
[XPASS(strict)] Tensor mismatch. PCC = 0.9425581505871167, but required = 0.99. Tracking on: https://github.com/tenstorrent/tt-mlir/issues/1576
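Both XPASS failures hinge on a PCC (Pearson correlation coefficient) comparison between the device output and the CPU golden output. The actual check lives in the forge test utilities (compare_with_golden); the sketch below is only an assumed, minimal way to compute such a PCC with stock PyTorch:

```python
import torch

def pcc(golden: torch.Tensor, calculated: torch.Tensor) -> float:
    # Pearson correlation of the flattened tensors via torch.corrcoef;
    # 1.0 means perfectly correlated outputs, lower means divergence.
    x = golden.flatten().double()
    y = calculated.flatten().double()
    return torch.corrcoef(torch.stack([x, y]))[0, 1].item()
```

Here the xfail reason records a previously measured PCC of ~0.9426 against a required 0.99; the [XPASS(strict)] means the check now passes and the marker is stale, pending the linked tt-mlir issue.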