Fixed head offset in follower after a snapshot load (#415)
The head offset in the follower was not updated after loading a snapshot.
This can make it look like the follower is not caught up with the
leader, when in fact it is.

```
Dec 27 22:43:34.180 |INF| Successfully moved ensemble to a new term component='shard-controller' followers=[{ server-address={ public='oxia-2.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-2.oxia-svc:6649' } entry-id={ term=-1 offset=-1 } },{ server-address={ public='oxia-1.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-1.oxia-svc:6649' } entry-id={ term=-1 offset=-1 } }] namespace='default' new-leader={ public='oxia-5.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-5.oxia-svc:6649' } shard=1 term=19
Dec 27 22:43:34.229 |INF| Successfully deleted shard component='shard-controller' namespace='default' server={ public='oxia-4.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-4.oxia-svc:6649' } shard=1
Dec 27 22:43:34.233 |INF| Elected new leader component='shard-controller' leader={ public='oxia-5.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-5.oxia-svc:6649' } namespace='default' shard=1 term=19
Dec 27 22:43:34.241 |INF| Follower is *not* caught-up yet with the leader component='shard-controller' follower-head-offset=-1 leader-head-offset=27556 namespace='default' server={ public='oxia-2.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-2.oxia-svc:6649' } shard=1
Dec 27 22:43:34.324 |INF| Follower is *not* caught-up yet with the leader component='shard-controller' follower-head-offset=-1 leader-head-offset=27556 namespace='default' server={ public='oxia-2.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-2.oxia-svc:6649' } shard=1
Dec 27 22:43:34.406 |INF| Follower is *not* caught-up yet with the leader component='shard-controller' follower-head-offset=-1 leader-head-offset=27556 namespace='default' server={ public='oxia-2.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-2.oxia-svc:6649' } shard=1
Dec 27 22:43:34.672 |INF| Follower is *not* caught-up yet with the leader component='shard-controller' follower-head-offset=-1 leader-head-offset=27556 namespace='default' server={ public='oxia-2.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-2.oxia-svc:6649' } shard=1
Dec 27 22:43:34.922 |INF| Follower is *not* caught-up yet with the leader component='shard-controller' follower-head-offset=-1 leader-head-offset=27556 namespace='default' server={ public='oxia-2.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-2.oxia-svc:6649' } shard=1
Dec 27 22:43:35.621 |INF| Follower is *not* caught-up yet with the leader component='shard-controller' follower-head-offset=-1 leader-head-offset=27556 namespace='default' server={ public='oxia-2.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-2.oxia-svc:6649' } shard=1
Dec 27 22:43:36.113 |INF| Follower is *not* caught-up yet with the leader component='shard-controller' follower-head-offset=-1 leader-head-offset=27556 namespace='default' server={ public='oxia-2.oxia-svc.oxia.svc.cluster.local:6648' internal='oxia-2.oxia-svc:6649' } shard=1
```

The node had indeed correctly applied the snapshot:

```
Dec 27 22:38:34.193 |INF| Received SendSnapshot request component='internal-rpc-server' namespace='default' peer='10.244.0.130:50430' shard=1
Dec 27 22:38:34.193 |INF| Closing leader controller component='leader-controller' namespace='default' shard=1 term=18
Dec 27 22:38:34.194 |INF| [JOB 1] WAL file /data/db/default/shard-1/000002.log with log number 000002 stopped reading at offset: 0; replayed 0 keys in 0 batches component='pebble' shard=1
Dec 27 22:38:34.196 |INF| Created follower commit-offset=-1 component='follower-controller' head-offset=-1 namespace='default' shard=1 term=18
Dec 27 22:38:34.208 |INF| Successfully applied snapshot commit-offset=27556 component='follower-controller' namespace='default' shard=1 snapshot-size=449137 term=18
Dec 27 22:38:34.220 |INF| Received Replicate request component='internal-rpc-server' namespace='default' peer='10.244.0.130:50430' shard=1
```
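The gist of the fix, as the diff below shows, is that a follower that starts with an empty WAL but a database restored from a snapshot must initialize its last-appended (head) offset from the snapshot's commit offset, rather than leaving it at the WAL's invalid offset. The following is a minimal sketch of that invariant only, using simplified hypothetical types rather than the actual Oxia structs:

```go
// Sketch only: illustrates the invariant addressed by this commit,
// not the real follower controller types.
package main

import "fmt"

const InvalidOffset int64 = -1

type follower struct {
	lastAppendedOffset int64 // head offset: last entry written to the WAL
	commitOffset       int64 // last offset applied to the database
}

// restoreFromSnapshot mimics what must happen after a snapshot load: the
// database is now at snapshotCommitOffset, and since the WAL is empty the
// head offset has to be aligned with it as well.
func (f *follower) restoreFromSnapshot(snapshotCommitOffset int64) {
	f.commitOffset = snapshotCommitOffset
	if f.lastAppendedOffset == InvalidOffset {
		// Without this step the follower keeps reporting head-offset=-1,
		// which is the symptom visible in the logs above.
		f.lastAppendedOffset = snapshotCommitOffset
	}
}

func main() {
	f := &follower{lastAppendedOffset: InvalidOffset, commitOffset: InvalidOffset}
	f.restoreFromSnapshot(27556)
	fmt.Println(f.lastAppendedOffset, f.commitOffset) // 27556 27556
}
```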
merlimat authored Dec 28, 2023
1 parent d7f48c7 commit 174b459
Showing 2 changed files with 32 additions and 2 deletions.
10 changes: 8 additions & 2 deletions server/follower_controller.go
@@ -148,6 +148,11 @@ func NewFollowerController(config Config, namespace string, shardId int64, wf wa
}
fc.commitOffset.Store(commitOffset)

+if fc.lastAppendedOffset == wal.InvalidOffset {
+// The wal is empty, though we have restored from snapshot
+fc.lastAppendedOffset = commitOffset
+}

fc.setLogger()

go common.DoWithLabels(
@@ -161,7 +166,7 @@ func NewFollowerController(config Config, namespace string, shardId int64, wf wa

fc.log.Info(
"Created follower",
slog.Int64("head-offset", fc.wal.LastOffset()),
slog.Int64("head-offset", fc.lastAppendedOffset),
slog.Int64("commit-offset", commitOffset),
)
return fc, nil
@@ -701,6 +706,7 @@ func (fc *followerController) handleSnapshot(stream proto.OxiaLogReplication_Sen

fc.db = newDb
fc.commitOffset.Store(commitOffset)
+fc.lastAppendedOffset = commitOffset
fc.closeStreamNoMutex(nil)

fc.log.Info(
@@ -718,7 +724,7 @@ func (fc *followerController) GetStatus(_ *proto.GetStatusRequest) (*proto.GetSt
return &proto.GetStatusResponse{
Term: fc.term,
Status: fc.status,
-HeadOffset: fc.wal.LastOffset(),
+HeadOffset: fc.lastAppendedOffset,
CommitOffset: fc.CommitOffset(),
}, nil
}
24 changes: 24 additions & 0 deletions server/follower_controller_test.go
@@ -650,6 +650,15 @@ func TestFollower_HandleSnapshot(t *testing.T) {
// Wait for follower to fully load the snapshot
wg.Wait()

+statusRes, err := fc.(*followerController).GetStatus(&proto.GetStatusRequest{
+ShardId: shardId,
+})
+assert.NoError(t, err)
+assert.Equal(t, proto.ServingStatus_FOLLOWER, statusRes.Status)
+assert.EqualValues(t, 1, statusRes.Term)
+assert.EqualValues(t, 99, statusRes.HeadOffset)
+assert.EqualValues(t, 99, statusRes.CommitOffset)

// At this point the content of the follower should only include the
// data from the snapshot and any existing data should be gone

@@ -681,6 +690,21 @@ func TestFollower_HandleSnapshot(t *testing.T) {

assert.Equal(t, wal.InvalidOffset, fc.(*followerController).wal.LastOffset())

+assert.NoError(t, fc.Close())
+
+// Re-Open the follower controller
+fc, err = NewFollowerController(Config{}, common.DefaultNamespace, shardId, walFactory, kvFactory)
+assert.NoError(t, err)
+
+statusRes, err = fc.(*followerController).GetStatus(&proto.GetStatusRequest{
+ShardId: shardId,
+})
+assert.NoError(t, err)
+assert.Equal(t, proto.ServingStatus_FENCED, statusRes.Status)
+assert.EqualValues(t, 1, statusRes.Term)
+assert.EqualValues(t, 99, statusRes.HeadOffset)
+assert.EqualValues(t, 99, statusRes.CommitOffset)

assert.NoError(t, fc.Close())
assert.NoError(t, kvFactory.Close())
assert.NoError(t, walFactory.Close())
