Skip to content

Commit

Permalink
GSM: split OnlyBootstrap into PreSyncing and Syncing (#975)
Browse files Browse the repository at this point in the history
This PR splits the previous OnlyBootstrap state of the GSM into separate
PreSyncing and Syncing states, making the GSM ready to be used for Genesis
while still allowing it to be used in conjunction with bootstrap peers.

Compared to the [Bootstrap Peers
IER](https://ouroboros-consensus.cardano.intersectmbo.org/docs/for-developers/BootstrapPeersIER),
there is one more state and two more transition rules, both based on the
Honest Availability Assumption (HAA), which the Network layer will try
to establish and inform us about, likely by ensuring a minimum number of
(big) ledger peers; see `isHaaSatisfied` in `GsmView`.

The new state transition diagram looks like this:

```mermaid
graph
    PreSyncing[PreSyncing]
    Syncing[Syncing]
    CaughtUp[CaughtUp]

    PreSyncing -- "Honest Availability Assumption\nis satisfied" --> Syncing
    Syncing -- "Honest Availability Assumption\nis no longer satisfied" --> PreSyncing
    Syncing -- "no peers claim to have\nsubsequent headers,\nand its selection is ≥\nthe best header" --> CaughtUp
    CaughtUp -- "vol tip became older than X" --> PreSyncing

    StartUp[[Node start-up]]
    StartUp -- "node was most recently in CaughtUp\nand vol tip is younger than X" --> CaughtUp
    StartUp -- "otherwise" --> PreSyncing
```
  • Loading branch information
amesgen authored Mar 25, 2024
2 parents 488bfc6 + 83cd2fe commit 0650fe2
Show file tree
Hide file tree
Showing 5 changed files with 287 additions and 136 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### Breaking

- GSM: split OnlyBootstrap into PreSyncing and Syncing
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ module Ouroboros.Consensus.Node.GSM (
, DurationFromNow (..)
, GsmEntryPoints (..)
, GsmNodeKernelArgs (..)
, GsmState (..)
, GsmView (..)
, MarkerFileView (..)
, WrapDurationUntilTooOld (..)
-- * Auxiliaries
, TraceGsmEvent (..)
, gsmStateToLedgerJudgement
, initializationLedgerJudgement
-- * Constructors
, realDurationUntilTooOld
Expand All @@ -27,7 +29,8 @@ module Ouroboros.Consensus.Node.GSM (
import qualified Cardano.Slotting.Slot as Slot
import qualified Control.Concurrent.Class.MonadSTM.TVar as LazySTM
import Control.Monad (forever, join, unless)
import Control.Monad.Class.MonadSTM (MonadSTM, STM, atomically, check)
import Control.Monad.Class.MonadSTM (MonadSTM, STM, atomically, check,
orElse)
import Control.Monad.Class.MonadThrow (MonadThrow)
import Control.Monad.Class.MonadTimer (threadDelay)
import qualified Control.Monad.Class.MonadTimer.SI as SI
Expand Down Expand Up @@ -78,11 +81,24 @@ data CandidateVersusSelection =
-- ^ Whether the candidate is better than the selection
deriving (Eq, Show)

-- | The state the Genesis State Machine (GSM) is currently in.
data GsmState
  = -- | The node is syncing, and the Honest Availability Assumption is not
    -- satisfied.
    PreSyncing
  | -- | The node is syncing, and the Honest Availability Assumption is
    -- satisfied.
    Syncing
  | -- | The node is caught-up.
    CaughtUp
  deriving (Eq, Show, Read)

data GsmView m upstreamPeer selection candidate = GsmView {
antiThunderingHerd :: Maybe StdGen
-- ^ An initial seed used to randomly increase 'minCaughtUpDuration' by up
-- to 15% every transition from OnlyBootstrap to CaughtUp, in order to
-- avoid a thundering herd phenemenon.
-- to 15% every transition from Syncing to CaughtUp, in order to avoid a
-- thundering herd phenomenon.
--
-- 'Nothing' should only be used for testing.
,
Expand Down Expand Up @@ -112,33 +128,39 @@ data GsmView m upstreamPeer selection candidate = GsmView {
,
minCaughtUpDuration :: NominalDiffTime
-- ^ How long the node must stay in CaughtUp after transitioning to it from
-- OnlyBootstrap, regardless of the selection's age. This prevents the
-- whole network from thrashing between CaughtUp and OnlyBootstrap if
-- there's an outage in block production.
-- Syncing, regardless of the selection's age. This prevents the whole
-- network from thrashing between CaughtUp and (Pre)Syncing if there's an
-- outage in block production.
--
-- See also 'antiThunderingHerd'.
,
setCaughtUpPersistentMark :: Bool -> m ()
-- ^ EG touch/delete the marker file on disk
,
writeLedgerStateJudgement :: LedgerStateJudgement -> m ()
-- ^ EG update the TVar that the Diffusion Layer monitors
writeGsmState :: GsmState -> m ()
-- ^ EG update the TVar that the Diffusion Layer monitors, or en-/disable
-- certain components of Genesis
,
isHaaSatisfied :: STM m Bool
-- ^ Whether the Honest Availability Assumption is currently satisfied. This
-- is used as the trigger for transitioning from 'PreSyncing' to 'Syncing'
-- and vice versa.
}

-- | The two proper GSM states for boot strap peers
-- | The two proper GSM entrypoints.
--
-- See the @BootstrapPeersIER.md@ document for their specification.
-- See the @BootstrapPeersIER.md@ document for documentation.
--
-- See 'initializationLedgerJudgement' for the @Initializing@ pseudo-state.
data GsmEntryPoints m = GsmEntryPoints {
enterCaughtUp :: forall neverTerminates. m neverTerminates
enterCaughtUp :: forall neverTerminates. m neverTerminates
-- ^ ASSUMPTION the marker file is present on disk, a la
-- @'setCaughtUpPersistentMark' True@
--
-- Thus this can be invoked at node start up after determining the marker
-- file is present (and the tip is still not stale)
,
enterOnlyBootstrap :: forall neverTerminates. m neverTerminates
enterPreSyncing :: forall neverTerminates. m neverTerminates
-- ^ ASSUMPTION the marker file is absent on disk, a la
-- @'setCaughtUpPersistentMark' False@
--
Expand Down Expand Up @@ -177,13 +199,28 @@ initializationLedgerJudgement
removeMarkerFile markerFileView
return TooOld

-- | Project a 'GsmState' onto the coarser 'LedgerStateJudgement' used in the
-- Diffusion layer, which does not distinguish 'PreSyncing' from 'Syncing':
-- both map to 'TooOld', and only 'CaughtUp' maps to 'YoungEnough'.
gsmStateToLedgerJudgement :: GsmState -> LedgerStateJudgement
gsmStateToLedgerJudgement PreSyncing = TooOld
gsmStateToLedgerJudgement Syncing    = TooOld
gsmStateToLedgerJudgement CaughtUp   = YoungEnough

{-------------------------------------------------------------------------------
A real implementation
-------------------------------------------------------------------------------}

-- | The actual GSM logic for bootstrap peers
--
-- See the @BootstrapPeersIER.md@ document for the specification of this logic.
-- See the @BootstrapPeersIER.md@ document for the specification of most of this
-- logic, except the transition rules between PreSyncing and Syncing, the two
-- states OnlyBootstrap is split into:
--
-- - PreSyncing ⟶ Syncing: The Honest Availability Assumption is satisfied.
--
-- - Syncing ⟶ PreSyncing: The Honest Availability Assumption is no longer
--   satisfied.
realGsmEntryPoints :: forall m upstreamPeer selection tracedSelection candidate.
( SI.MonadDelay m
, SI.MonadTimer m
Expand All @@ -195,7 +232,7 @@ realGsmEntryPoints :: forall m upstreamPeer selection tracedSelection candidate.
realGsmEntryPoints tracerArgs gsmView = GsmEntryPoints {
enterCaughtUp
,
enterOnlyBootstrap
enterPreSyncing
}
where
(cnvSelection, tracer) = tracerArgs
Expand All @@ -219,39 +256,62 @@ realGsmEntryPoints tracerArgs gsmView = GsmEntryPoints {
,
setCaughtUpPersistentMark
,
writeLedgerStateJudgement
writeGsmState
,
isHaaSatisfied
} = gsmView

enterCaughtUp :: forall neverTerminates. m neverTerminates
enterCaughtUp = enterCaughtUp' antiThunderingHerd

enterOnlyBootstrap :: forall neverTerminates. m neverTerminates
enterOnlyBootstrap = enterOnlyBootstrap' antiThunderingHerd
enterPreSyncing :: forall neverTerminates. m neverTerminates
enterPreSyncing = enterPreSyncing' antiThunderingHerd

enterCaughtUp' :: forall neverTerminates. Maybe StdGen -> m neverTerminates
enterCaughtUp' g = do
(g', ev) <- blockWhileCaughtUp g

setCaughtUpPersistentMark False
writeLedgerStateJudgement TooOld
writeGsmState PreSyncing
traceWith tracer ev

enterOnlyBootstrap' g'
enterPreSyncing' g'

enterOnlyBootstrap' :: Maybe StdGen -> forall neverTerminates. m neverTerminates
enterOnlyBootstrap' g = do
ev <- blockUntilCaughtUp
enterPreSyncing' :: Maybe StdGen -> forall neverTerminates. m neverTerminates
enterPreSyncing' g = do
blockUntilHonestAvailabilityAssumption

writeLedgerStateJudgement YoungEnough
setCaughtUpPersistentMark True
traceWith tracer ev
writeGsmState Syncing
traceWith tracer GsmEventPreSyncingToSyncing

-- When transitioning from OnlyBootstrap to CaughtUp, the node will
-- remain in CaughtUp for at least 'minCaughtUpDuration', regardless of
-- the selection's age.
SI.threadDelay $ realToFrac minCaughtUpDuration
enterSyncing' g

enterCaughtUp' g
-- Run the Syncing state: block until one of its two exit conditions holds,
-- then perform the corresponding transition. Never returns.
enterSyncing' :: Maybe StdGen -> forall neverTerminates. m neverTerminates
enterSyncing' g =
    atomically awaitExit >>= \case
      -- The Honest Availability Assumption is no longer satisfied, so fall
      -- back to PreSyncing.
      Nothing -> do
        writeGsmState PreSyncing
        traceWith tracer GsmEventSyncingToPreSyncing

        enterPreSyncing' g

      -- We are caught up; the payload is the event to trace.
      Just caughtUpEvent -> do
        writeGsmState CaughtUp
        setCaughtUpPersistentMark True
        traceWith tracer caughtUpEvent

        -- When transitioning from Syncing to CaughtUp, the node will remain
        -- in CaughtUp for at least 'minCaughtUpDuration', regardless of the
        -- selection's age.
        SI.threadDelay (realToFrac minCaughtUpDuration)

        enterCaughtUp' g
  where
    -- A single STM transaction that retries until either exit condition
    -- holds. The "HAA no longer satisfied" alternative is tried first; only
    -- if it retries is the caught-up check consulted.
    awaitExit =
      orElse
        (Nothing <$ blockWhileHonestAvailabilityAssumption)
        (Just <$> blockUntilCaughtUp)

blockWhileCaughtUp ::
Maybe StdGen
Expand All @@ -267,8 +327,8 @@ realGsmEntryPoints tracerArgs gsmView = GsmEntryPoints {
-- load out.
--
-- TODO should the Diffusion Layer do this? IE the node /promptly/
-- switches to OnlyBootstrap, but then the Diffusion Layer introces a
-- delay before reaching out to the bootstrap peers?
-- switches to PreSyncing, but then the Diffusion Layer introduces a
-- delay before reaching out to the bootstrap peers?
let (bonus, g') = case g of
Nothing -> (0, Nothing) -- it's disabled in some tests
Just x ->
Expand Down Expand Up @@ -311,8 +371,8 @@ realGsmEntryPoints tracerArgs gsmView = GsmEntryPoints {
check $ not $ equivalent selection selection'
pure $ blockWhileCaughtUpHelper bonus selection'

blockUntilCaughtUp :: m (TraceGsmEvent tracedSelection)
blockUntilCaughtUp = atomically $ do
blockUntilCaughtUp :: STM m (TraceGsmEvent tracedSelection)
blockUntilCaughtUp = do
-- STAGE 1: all ChainSync clients report no subsequent headers
idlers <- getChainSyncIdlers
varsCandidate <- getChainSyncCandidates
Expand Down Expand Up @@ -364,12 +424,26 @@ realGsmEntryPoints tracerArgs gsmView = GsmEntryPoints {
-- anticipating. And then the STM validation at the end touches them
-- all one last time. Summary: seems likely to be fast enough.)

-- Block (by retrying the STM transaction) until 'isHaaSatisfied' reports
-- 'True'.
blockUntilHonestAvailabilityAssumption :: m ()
blockUntilHonestAvailabilityAssumption = atomically $ do
    haaSatisfied <- isHaaSatisfied
    check haaSatisfied

-- STM action that retries for as long as 'isHaaSatisfied' reports 'True',
-- ie it only completes once the assumption is no longer satisfied. Left in
-- 'STM' so that callers can compose it with other blocking conditions.
blockWhileHonestAvailabilityAssumption :: STM m ()
blockWhileHonestAvailabilityAssumption = do
    haaSatisfied <- isHaaSatisfied
    check (not haaSatisfied)

-- | Trace events of the GSM, parameterised over how the selection is
-- represented in the trace.
data TraceGsmEvent selection
  = -- | how many peers and the current selection
    GsmEventEnterCaughtUp !Int !selection
  | -- | the current selection and its age
    GsmEventLeaveCaughtUp !selection !DurationFromNow
  | -- | the Honest Availability Assumption is now satisfied
    GsmEventPreSyncingToSyncing
  | -- | the Honest Availability Assumption is no longer satisfied
    GsmEventSyncingToPreSyncing
  deriving (Eq, Show)

{-------------------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,13 +229,16 @@ initNodeKernel args@NodeKernelArgs { registry, cfg, tracers
, GSM.setCaughtUpPersistentMark = \upd ->
(if upd then GSM.touchMarkerFile else GSM.removeMarkerFile)
gsmMarkerFileView
, GSM.writeLedgerStateJudgement = \x -> atomically $ do
writeTVar varLedgerJudgement x
, GSM.writeGsmState = \x -> atomically $ do
writeTVar varLedgerJudgement $ GSM.gsmStateToLedgerJudgement x
, -- In the context of bootstrap peers, it is fine to always
-- return 'True' as all peers are trusted during syncing.
GSM.isHaaSatisfied = pure True
}
judgment <- readTVarIO varLedgerJudgement
void $ forkLinkedThread registry "NodeKernel.GSM" $ case judgment of
TooOld -> GSM.enterOnlyBootstrap gsm
YoungEnough -> GSM.enterCaughtUp gsm
TooOld -> GSM.enterPreSyncing gsm
YoungEnough -> GSM.enterCaughtUp gsm

void $ forkLinkedThread registry "NodeKernel.blockForging" $
blockForgingController st (LazySTM.takeTMVar blockForgingVar)
Expand Down
Loading

0 comments on commit 0650fe2

Please sign in to comment.