From e6eeef785ef34611b7caae8daf8bac61d51d9b77 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Mon, 8 May 2023 11:17:14 -0400 Subject: [PATCH 01/18] mutex_debug experimental test to see if we can have a test mode that verifies mutexes lock in the order we want, while having no hit on production performance. Since this uses a build tag, it should all compile out during the build process and be a no-op unless the tag is set. --- Makefile | 3 +++ go.mod | 1 + go.sum | 2 ++ hostmap.go | 6 +++-- mutex.go | 14 +++++++++++ mutex_debug.go | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 mutex.go create mode 100644 mutex_debug.go diff --git a/Makefile b/Makefile index 512fdc2ff..8e771b078 100644 --- a/Makefile +++ b/Makefile @@ -54,6 +54,9 @@ ALL = $(ALL_LINUX) \ e2e: $(TEST_ENV) go test -tags=e2e_testing -count=1 $(TEST_FLAGS) ./e2e +e2e-mutex-debug: + $(TEST_ENV) go test -tags=mutex_debug,e2e_testing -count=1 $(TEST_FLAGS) ./e2e + e2ev: TEST_FLAGS = -v e2ev: e2e diff --git a/go.mod b/go.mod index 52c2e92d1..93fb85d59 100644 --- a/go.mod +++ b/go.mod @@ -42,6 +42,7 @@ require ( github.com/prometheus/common v0.42.0 // indirect github.com/prometheus/procfs v0.9.0 // indirect github.com/rogpeppe/go-internal v1.10.0 // indirect + github.com/timandy/routine v1.1.1 // indirect github.com/vishvananda/netns v0.0.4 // indirect golang.org/x/mod v0.10.0 // indirect golang.org/x/tools v0.8.0 // indirect diff --git a/go.sum b/go.sum index 452a1d237..6758c3d35 100644 --- a/go.sum +++ b/go.sum @@ -140,6 +140,8 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/timandy/routine v1.1.1 h1:6/Z7qLFZj3GrzuRksBFzIG8YGUh8CLhjnnMePBQTrEI= +github.com/timandy/routine v1.1.1/go.mod h1:OZHPOKSvqL/ZvqXFkNZyit0xIVelERptYXdAHH00adQ= github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0= github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= diff --git a/hostmap.go b/hostmap.go index e5949add2..597d7e40e 100644 --- a/hostmap.go +++ b/hostmap.go @@ -51,7 +51,7 @@ type Relay struct { } type HostMap struct { - sync.RWMutex //Because we concurrently read and write to our maps + syncRWMutex //Because we concurrently read and write to our maps name string Indexes map[uint32]*HostInfo Relays map[uint32]*HostInfo // Maps a Relay IDX to a Relay HostInfo object @@ -197,7 +197,7 @@ func (rs *RelayState) InsertRelay(ip iputil.VpnIp, idx uint32, r *Relay) { } type HostInfo struct { - sync.RWMutex + syncRWMutex remote *udp.Addr remotes *RemoteList @@ -261,6 +261,7 @@ func NewHostMap(l *logrus.Logger, name string, vpnCIDR *net.IPNet, preferredRang r := map[uint32]*HostInfo{} relays := map[uint32]*HostInfo{} m := HostMap{ + syncRWMutex: newSyncRWMutex("hostmap", name), name: name, Indexes: i, Relays: relays, @@ -321,6 +322,7 @@ func (hm *HostMap) AddVpnIp(vpnIp iputil.VpnIp, init func(hostinfo *HostInfo)) ( if h, ok := hm.Hosts[vpnIp]; !ok { hm.RUnlock() h = &HostInfo{ + syncRWMutex: newSyncRWMutex("hostinfo"), vpnIp: vpnIp, HandshakePacket: make(map[uint8][]byte, 0), relayState: RelayState{ diff --git a/mutex.go b/mutex.go new file mode 100644 index 000000000..b7ff88ad3 --- /dev/null +++ b/mutex.go @@ -0,0 +1,14 @@ +//go:build !mutex_debug +// +build !mutex_debug + +package nebula + +import ( + "sync" +) + +type syncRWMutex = sync.RWMutex + +func newSyncRWMutex(t ...string) syncRWMutex { + return sync.RWMutex{} +} diff --git a/mutex_debug.go b/mutex_debug.go new file mode 100644 index 000000000..5f2009246 --- /dev/null +++ b/mutex_debug.go @@ -0,0 +1,66 @@ +//go:build mutex_debug +// +build mutex_debug + +package nebula + +import ( + "strings" + "sync" + + "github.com/timandy/routine" +) + +var threadLocal routine.ThreadLocal = routine.NewThreadLocalWithInitial(func() any { return map[string]bool{} }) + +type syncRWMutex struct { + sync.RWMutex + mutexType string +} + +func newSyncRWMutex(t ...string) syncRWMutex { + return syncRWMutex{ + mutexType: strings.Join(t, "-"), + } +} + +func checkMutex(state map[string]bool, add string) { + if add == "hostinfo" { + if state["hostmap-main"] { + panic("grabbing hostinfo lock and already have hostmap-main") + } + if state["hostmap-pending"] { + panic("grabbing hostinfo lock and already have hostmap-pending") + } + } + if add == "hostmap-pending" { + if state["hostmap-main"] { + panic("grabbing hostmap-pending lock and already have hostmap-main") + } + } +} + +func (s *syncRWMutex) Lock() { + m := threadLocal.Get().(map[string]bool) + checkMutex(m, s.mutexType) + m[s.mutexType] = true + s.RWMutex.Lock() +} + +func (s *syncRWMutex) Unlock() { + m := threadLocal.Get().(map[string]bool) + m[s.mutexType] = false + s.RWMutex.Unlock() +} + +func (s *syncRWMutex) RLock() { + m := threadLocal.Get().(map[string]bool) + checkMutex(m, s.mutexType) + m[s.mutexType] = true + s.RWMutex.RLock() +} + +func (s *syncRWMutex) RUnlock() { + m := threadLocal.Get().(map[string]bool) + m[s.mutexType] = false + s.RWMutex.RUnlock() +} From 3e5e48f937256b0e3e218642c12164d805eb0c85 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Tue, 9 May 2023 10:39:28 -0400 Subject: [PATCH 02/18] use mutex_debug during Github Actions run --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 05aff781d..a97b4730b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -44,7 +44,7 @@ jobs: run: make test - name: End 2 end - run: make e2evv + run: make e2e-mutex-debug TEST_LOGS=1 TEST_FLAGS=-v - uses: actions/upload-artifact@v3 with: From 9105eba939ab045662d7c7b467a5508df5ce55d8 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Tue, 9 May 2023 11:22:55 -0400 Subject: [PATCH 03/18] also validate hostinfo locks --- hostmap.go | 4 ++-- mutex.go | 8 ++++++- mutex_debug.go | 62 ++++++++++++++++++++++++++++++-------------------- 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/hostmap.go b/hostmap.go index 597d7e40e..bef6e4ed0 100644 --- a/hostmap.go +++ b/hostmap.go @@ -261,7 +261,7 @@ func NewHostMap(l *logrus.Logger, name string, vpnCIDR *net.IPNet, preferredRang r := map[uint32]*HostInfo{} relays := map[uint32]*HostInfo{} m := HostMap{ - syncRWMutex: newSyncRWMutex("hostmap", name), + syncRWMutex: newSyncRWMutex(mutexKey{Type: "hostmap", SubType: name}), name: name, Indexes: i, Relays: relays, @@ -322,7 +322,7 @@ func (hm *HostMap) AddVpnIp(vpnIp iputil.VpnIp, init func(hostinfo *HostInfo)) ( if h, ok := hm.Hosts[vpnIp]; !ok { hm.RUnlock() h = &HostInfo{ - syncRWMutex: newSyncRWMutex("hostinfo"), + syncRWMutex: newSyncRWMutex(mutexKey{Type: "hostinfo", ID: uint32(vpnIp)}), vpnIp: vpnIp, HandshakePacket: make(map[uint8][]byte, 0), relayState: RelayState{ diff --git a/mutex.go b/mutex.go index b7ff88ad3..181767468 100644 --- a/mutex.go +++ b/mutex.go @@ -9,6 +9,12 @@ import ( type syncRWMutex = sync.RWMutex -func newSyncRWMutex(t ...string) syncRWMutex { +func newSyncRWMutex(mutexKey) syncRWMutex { return sync.RWMutex{} } + +type mutexKey struct { + Type string + SubType string + ID uint32 +} diff --git a/mutex_debug.go b/mutex_debug.go index 5f2009246..c692cb841 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -4,63 +4,75 @@ package nebula import ( - "strings" + "fmt" "sync" "github.com/timandy/routine" ) -var threadLocal routine.ThreadLocal = routine.NewThreadLocalWithInitial(func() any { return map[string]bool{} }) +var threadLocal routine.ThreadLocal = routine.NewThreadLocalWithInitial(func() any { return map[mutexKey]bool{} }) + +type mutexKey struct { + Type string + SubType string + ID uint32 +} type syncRWMutex struct { sync.RWMutex - mutexType string + mutexKey } -func newSyncRWMutex(t ...string) syncRWMutex { +func newSyncRWMutex(key mutexKey) syncRWMutex { return syncRWMutex{ - mutexType: strings.Join(t, "-"), + mutexKey: key, } } -func checkMutex(state map[string]bool, add string) { - if add == "hostinfo" { - if state["hostmap-main"] { - panic("grabbing hostinfo lock and already have hostmap-main") +func checkMutex(state map[mutexKey]bool, add mutexKey) { + switch add.Type { + case "hostinfo": + // Check for any other hostinfo keys: + for k, v := range state { + if k.Type == "hostinfo" && v { + panic(fmt.Errorf("grabbing hostinfo lock and already have a hostinfo lock: state=%v add=%v", state, add)) + } } - if state["hostmap-pending"] { - panic("grabbing hostinfo lock and already have hostmap-pending") + if state[mutexKey{Type: "hostmap", SubType: "main"}] { + panic(fmt.Errorf("grabbing hostinfo lock and already have hostmap-main: state=%v add=%v", state, add)) } - } - if add == "hostmap-pending" { - if state["hostmap-main"] { - panic("grabbing hostmap-pending lock and already have hostmap-main") + if state[mutexKey{Type: "hostmap", SubType: "pending"}] { + panic(fmt.Errorf("grabbing hostinfo lock and already have hostmap-pending: state=%v add=%v", state, add)) + } + case "hostmap-pending": + if state[mutexKey{Type: "hostmap", SubType: "main"}] { + panic(fmt.Errorf("grabbing hostmap-pending lock and already have hostmap-main: state=%v add=%v", state, add)) } } } func (s *syncRWMutex) Lock() { - m := threadLocal.Get().(map[string]bool) - checkMutex(m, s.mutexType) - m[s.mutexType] = true + m := threadLocal.Get().(map[mutexKey]bool) + checkMutex(m, s.mutexKey) + m[s.mutexKey] = true s.RWMutex.Lock() } func (s *syncRWMutex) Unlock() { - m := threadLocal.Get().(map[string]bool) - m[s.mutexType] = false + m := threadLocal.Get().(map[mutexKey]bool) + m[s.mutexKey] = false s.RWMutex.Unlock() } func (s *syncRWMutex) RLock() { - m := threadLocal.Get().(map[string]bool) - checkMutex(m, s.mutexType) - m[s.mutexType] = true + m := threadLocal.Get().(map[mutexKey]bool) + checkMutex(m, s.mutexKey) + m[s.mutexKey] = true s.RWMutex.RLock() } func (s *syncRWMutex) RUnlock() { - m := threadLocal.Get().(map[string]bool) - m[s.mutexType] = false + m := threadLocal.Get().(map[mutexKey]bool) + m[s.mutexKey] = false s.RWMutex.RUnlock() } From 90e9a8e42cdb134e742e7c35104476396d39604a Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Tue, 9 May 2023 11:41:53 -0400 Subject: [PATCH 04/18] use delete --- mutex_debug.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mutex_debug.go b/mutex_debug.go index c692cb841..60aeb3f41 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -60,7 +60,7 @@ func (s *syncRWMutex) Lock() { func (s *syncRWMutex) Unlock() { m := threadLocal.Get().(map[mutexKey]bool) - m[s.mutexKey] = false + delete(m, s.mutexKey) s.RWMutex.Unlock() } @@ -73,6 +73,6 @@ func (s *syncRWMutex) RLock() { func (s *syncRWMutex) RUnlock() { m := threadLocal.Get().(map[mutexKey]bool) - m[s.mutexKey] = false + delete(m, s.mutexKey) s.RWMutex.RUnlock() } From e5789770b17d4e858981879fd72e13ef49cc7816 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Tue, 9 May 2023 11:51:02 -0400 Subject: [PATCH 05/18] keep track of what file/line the locks were grabbed on --- mutex_debug.go | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/mutex_debug.go b/mutex_debug.go index 60aeb3f41..3210b1ed2 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -5,12 +5,13 @@ package nebula import ( "fmt" + "runtime" "sync" "github.com/timandy/routine" ) -var threadLocal routine.ThreadLocal = routine.NewThreadLocalWithInitial(func() any { return map[mutexKey]bool{} }) +var threadLocal routine.ThreadLocal = routine.NewThreadLocalWithInitial(func() any { return map[mutexKey]mutexValue{} }) type mutexKey struct { Type string @@ -18,6 +19,11 @@ type mutexKey struct { ID uint32 } +type mutexValue struct { + file string + line int +} + type syncRWMutex struct { sync.RWMutex mutexKey @@ -29,50 +35,54 @@ func newSyncRWMutex(key mutexKey) syncRWMutex { } } -func checkMutex(state map[mutexKey]bool, add mutexKey) { +func checkMutex(state map[mutexKey]mutexValue, add mutexKey) { switch add.Type { case "hostinfo": // Check for any other hostinfo keys: - for k, v := range state { - if k.Type == "hostinfo" && v { + for k := range state { + if k.Type == "hostinfo" { panic(fmt.Errorf("grabbing hostinfo lock and already have a hostinfo lock: state=%v add=%v", state, add)) } } - if state[mutexKey{Type: "hostmap", SubType: "main"}] { + if _, ok := state[mutexKey{Type: "hostmap", SubType: "main"}]; ok { panic(fmt.Errorf("grabbing hostinfo lock and already have hostmap-main: state=%v add=%v", state, add)) } - if state[mutexKey{Type: "hostmap", SubType: "pending"}] { + if _, ok := state[mutexKey{Type: "hostmap", SubType: "pending"}]; ok { panic(fmt.Errorf("grabbing hostinfo lock and already have hostmap-pending: state=%v add=%v", state, add)) } case "hostmap-pending": - if state[mutexKey{Type: "hostmap", SubType: "main"}] { + if _, ok := state[mutexKey{Type: "hostmap", SubType: "main"}]; ok { panic(fmt.Errorf("grabbing hostmap-pending lock and already have hostmap-main: state=%v add=%v", state, add)) } } } func (s *syncRWMutex) Lock() { - m := threadLocal.Get().(map[mutexKey]bool) + m := threadLocal.Get().(map[mutexKey]mutexValue) checkMutex(m, s.mutexKey) - m[s.mutexKey] = true + v := mutexValue{} + _, v.file, v.line, _ = runtime.Caller(1) + m[s.mutexKey] = v s.RWMutex.Lock() } func (s *syncRWMutex) Unlock() { - m := threadLocal.Get().(map[mutexKey]bool) + m := threadLocal.Get().(map[mutexKey]mutexValue) delete(m, s.mutexKey) s.RWMutex.Unlock() } func (s *syncRWMutex) RLock() { - m := threadLocal.Get().(map[mutexKey]bool) + m := threadLocal.Get().(map[mutexKey]mutexValue) checkMutex(m, s.mutexKey) - m[s.mutexKey] = true + v := mutexValue{} + _, v.file, v.line, _ = runtime.Caller(1) + m[s.mutexKey] = v s.RWMutex.RLock() } func (s *syncRWMutex) RUnlock() { - m := threadLocal.Get().(map[mutexKey]bool) + m := threadLocal.Get().(map[mutexKey]mutexValue) delete(m, s.mutexKey) s.RWMutex.RUnlock() } From 4c89b3c6a343c44c7bae2b5a530340661ff759d7 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Mon, 21 Aug 2023 13:09:25 -0400 Subject: [PATCH 06/18] cleanup --- go.mod | 2 +- handshake_ix.go | 1 + handshake_manager.go | 5 +++-- hostmap.go | 2 +- mutex.go | 13 ++++++++++--- mutex_debug.go | 31 +++++++++++++++++++------------ 6 files changed, 35 insertions(+), 19 deletions(-) diff --git a/go.mod b/go.mod index 342e7bb81..e98b010c9 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e github.com/songgao/water v0.0.0-20200317203138-2b4b6d7c09d8 github.com/stretchr/testify v1.8.4 + github.com/timandy/routine v1.1.1 github.com/vishvananda/netlink v1.1.0 golang.org/x/crypto v0.12.0 golang.org/x/exp v0.0.0-20230425010034-47ecfdc1ba53 @@ -43,7 +44,6 @@ require ( github.com/prometheus/common v0.42.0 // indirect github.com/prometheus/procfs v0.10.1 // indirect github.com/rogpeppe/go-internal v1.10.0 // indirect - github.com/timandy/routine v1.1.1 // indirect github.com/vishvananda/netns v0.0.4 // indirect golang.org/x/mod v0.10.0 // indirect golang.org/x/tools v0.8.0 // indirect diff --git a/handshake_ix.go b/handshake_ix.go index 52efdf5e6..09c65ad22 100644 --- a/handshake_ix.go +++ b/handshake_ix.go @@ -130,6 +130,7 @@ func ixHandshakeStage1(f *Interface, addr *udp.Addr, via *ViaSender, packet []by } hostinfo := &HostInfo{ + syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeHostInfo, ID: uint32(vpnIp)}), ConnectionState: ci, localIndexId: myIndex, remoteIndexId: hs.Details.InitiatorIndex, diff --git a/handshake_manager.go b/handshake_manager.go index a70f4dbc3..1a172d373 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -7,7 +7,6 @@ import ( "encoding/binary" "errors" "net" - "sync" "time" "github.com/rcrowley/go-metrics" @@ -44,7 +43,7 @@ type HandshakeConfig struct { type HandshakeManager struct { // Mutex for interacting with the vpnIps and indexes maps - sync.RWMutex + syncRWMutex vpnIps map[iputil.VpnIp]*HostInfo indexes map[uint32]*HostInfo @@ -65,6 +64,7 @@ type HandshakeManager struct { func NewHandshakeManager(l *logrus.Logger, tunCidr *net.IPNet, preferredRanges []*net.IPNet, mainHostMap *HostMap, lightHouse *LightHouse, outside udp.Conn, config HandshakeConfig) *HandshakeManager { return &HandshakeManager{ + syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeHandshakeManager}), vpnIps: map[iputil.VpnIp]*HostInfo{}, indexes: map[uint32]*HostInfo{}, mainHostMap: mainHostMap, @@ -308,6 +308,7 @@ func (c *HandshakeManager) AddVpnIp(vpnIp iputil.VpnIp, init func(*HostInfo)) *H } hostinfo := &HostInfo{ + syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeHostInfo, ID: uint32(vpnIp)}), vpnIp: vpnIp, HandshakePacket: make(map[uint8][]byte, 0), relayState: RelayState{ diff --git a/hostmap.go b/hostmap.go index c82d73bc3..5f2a846a5 100644 --- a/hostmap.go +++ b/hostmap.go @@ -264,7 +264,7 @@ func NewHostMap(l *logrus.Logger, vpnCIDR *net.IPNet, preferredRanges []*net.IPN r := map[uint32]*HostInfo{} relays := map[uint32]*HostInfo{} m := HostMap{ - syncRWMutex: newSyncRWMutex(mutexKey{Type: "hostmap"}), + syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeHostMap}), Indexes: i, Relays: relays, RemoteIndexes: r, diff --git a/mutex.go b/mutex.go index 181767468..1bb3c20a6 100644 --- a/mutex.go +++ b/mutex.go @@ -9,12 +9,19 @@ import ( type syncRWMutex = sync.RWMutex +type mutexKeyType string + +const ( + mutexKeyTypeHostMap mutexKeyType = "hostmap" + mutexKeyTypeHostInfo = "hostinfo" + mutexKeyTypeHandshakeManager = "handshake-manager" +) + func newSyncRWMutex(mutexKey) syncRWMutex { return sync.RWMutex{} } type mutexKey struct { - Type string - SubType string - ID uint32 + Type mutexKeyType + ID uint32 } diff --git a/mutex_debug.go b/mutex_debug.go index 3210b1ed2..169316c59 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -13,10 +13,17 @@ import ( var threadLocal routine.ThreadLocal = routine.NewThreadLocalWithInitial(func() any { return map[mutexKey]mutexValue{} }) +type mutexKeyType string + +const ( + mutexKeyTypeHostMap mutexKeyType = "hostmap" + mutexKeyTypeHostInfo = "hostinfo" + mutexKeyTypeHandshakeManager = "handshake-manager" +) + type mutexKey struct { - Type string - SubType string - ID uint32 + Type mutexKeyType + ID uint32 } type mutexValue struct { @@ -37,22 +44,22 @@ func newSyncRWMutex(key mutexKey) syncRWMutex { func checkMutex(state map[mutexKey]mutexValue, add mutexKey) { switch add.Type { - case "hostinfo": + case mutexKeyTypeHostInfo: // Check for any other hostinfo keys: for k := range state { - if k.Type == "hostinfo" { + if k.Type == mutexKeyTypeHostInfo { panic(fmt.Errorf("grabbing hostinfo lock and already have a hostinfo lock: state=%v add=%v", state, add)) } } - if _, ok := state[mutexKey{Type: "hostmap", SubType: "main"}]; ok { - panic(fmt.Errorf("grabbing hostinfo lock and already have hostmap-main: state=%v add=%v", state, add)) + if _, ok := state[mutexKey{Type: mutexKeyTypeHostMap}]; ok { + panic(fmt.Errorf("grabbing hostinfo lock and already have hostmap: state=%v add=%v", state, add)) } - if _, ok := state[mutexKey{Type: "hostmap", SubType: "pending"}]; ok { - panic(fmt.Errorf("grabbing hostinfo lock and already have hostmap-pending: state=%v add=%v", state, add)) + if _, ok := state[mutexKey{Type: mutexKeyTypeHandshakeManager}]; ok { + panic(fmt.Errorf("grabbing hostinfo lock and already have handshake-manager: state=%v add=%v", state, add)) } - case "hostmap-pending": - if _, ok := state[mutexKey{Type: "hostmap", SubType: "main"}]; ok { - panic(fmt.Errorf("grabbing hostmap-pending lock and already have hostmap-main: state=%v add=%v", state, add)) + case mutexKeyTypeHandshakeManager: + if _, ok := state[mutexKey{Type: mutexKeyTypeHostMap}]; ok { + panic(fmt.Errorf("grabbing handshake-manager lock and already have hostmap: state=%v add=%v", state, add)) } } } From 5ce82798752b2d9d841147307b677e7ea91f9ad5 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Mon, 18 Dec 2023 21:01:26 -0500 Subject: [PATCH 07/18] update to work with the latest locks --- mutex_debug.go | 60 +++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/mutex_debug.go b/mutex_debug.go index af3ba7b5d..605964b8f 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -5,9 +5,7 @@ package nebula import ( "fmt" - "log" "runtime" - "runtime/debug" "sync" "github.com/timandy/routine" @@ -23,6 +21,13 @@ const ( mutexKeyTypeHandshakeManager = "handshake-manager" ) +// For each Key in this map, the Value is a list of lock types you can already have +// when you want to grab that Key. This ensures that locks are always fetched +// in the same order, to prevent deadlocks. +var allowedConcurrentLocks = map[mutexKeyType][]mutexKeyType{ + mutexKeyTypeHandshakeManager: {mutexKeyTypeHostMap}, +} + type mutexKey struct { Type mutexKeyType ID uint32 @@ -45,37 +50,30 @@ func newSyncRWMutex(key mutexKey) syncRWMutex { } func alertMutex(err error) { - log.Print(err, string(debug.Stack())) + panic(err) + // NOTE: you could switch to this log Line and remove the panic if you want + // to log all failures instead of panicking on the first one + //log.Print(err, string(debug.Stack())) } func checkMutex(state map[mutexKey]mutexValue, add mutexKey) { - for k := range state { + allowedConcurrent := allowedConcurrentLocks[add.Type] + + for k, v := range state { if add == k { - alertMutex(fmt.Errorf("re-entrant lock: state=%v add=%v", state, add)) + alertMutex(fmt.Errorf("re-entrant lock: %s. previous allocation: %s", add, v)) } - } - switch add.Type { - case mutexKeyTypeHostInfo: - // Check for any other hostinfo keys: - for k := range state { - if k.Type == mutexKeyTypeHostInfo { - alertMutex(fmt.Errorf("grabbing hostinfo lock and already have a hostinfo lock: state=%v add=%v", state, add)) + // TODO use slices.Contains, but requires go1.21 + var found bool + for _, a := range allowedConcurrent { + if a == k.Type { + found = true + break } } - if _, ok := state[mutexKey{Type: mutexKeyTypeHostMap}]; ok { - alertMutex(fmt.Errorf("grabbing hostinfo lock and already have hostmap: state=%v add=%v", state, add)) - } - if _, ok := state[mutexKey{Type: mutexKeyTypeHandshakeManager}]; ok { - alertMutex(fmt.Errorf("grabbing hostinfo lock and already have handshake-manager: state=%v add=%v", state, add)) - } - // case mutexKeyTypeHandshakeManager: - // if _, ok := state[mutexKey{Type: mutexKeyTypeHostMap}]; ok { - // alertMutex(fmt.Errorf("grabbing handshake-manager lock and already have hostmap: state=%v add=%v", state, add)) - // } - case mutexKeyTypeHostMap: - if _, ok := state[mutexKey{Type: mutexKeyTypeHandshakeManager}]; ok { - alertMutex(fmt.Errorf("grabbing hostmap lock and already have handshake-manager: state=%v add=%v", state, add)) + if !found { + alertMutex(fmt.Errorf("grabbing %s lock and already have these locks: %s", add.Type, state)) } } } @@ -109,3 +107,15 @@ func (s *syncRWMutex) RUnlock() { delete(m, s.mutexKey) s.RWMutex.RUnlock() } + +func (m mutexKey) String() string { + if m.ID == 0 { + return fmt.Sprintf("%s", m.Type) + } else { + return fmt.Sprintf("%s(%d)", m.Type, m.ID) + } +} + +func (m mutexValue) String() string { + return fmt.Sprintf("%s:%d", m.file, m.line) +} From 4d88c0711a364064aaba84437e460a95b7d3579b Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Mon, 18 Dec 2023 21:04:05 -0500 Subject: [PATCH 08/18] gofmt --- handshake_manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/handshake_manager.go b/handshake_manager.go index 6176356a3..e49b9ba80 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -7,8 +7,8 @@ import ( "encoding/binary" "errors" "net" - "time" "sync" + "time" "github.com/rcrowley/go-metrics" "github.com/sirupsen/logrus" From 540a171ef8ec9f8ef6a64976e92b97626f3d7e91 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Mon, 18 Dec 2023 22:28:24 -0500 Subject: [PATCH 09/18] WIP more locks --- connection_manager.go | 13 ++++---- connection_state.go | 4 +-- firewall.go | 4 +-- go.mod | 3 ++ go.sum | 7 ++++ handshake_manager.go | 4 +-- lighthouse.go | 4 +-- mutex.go | 58 ++++++++++++++++++++++++-------- mutex_debug.go | 78 +++++++++++++++++++++++++++---------------- mutex_nodebug.go | 19 +++++++++++ 10 files changed, 138 insertions(+), 56 deletions(-) create mode 100644 mutex_nodebug.go diff --git a/connection_manager.go b/connection_manager.go index a1897566a..924d19860 100644 --- a/connection_manager.go +++ b/connection_manager.go @@ -3,7 +3,6 @@ package nebula import ( "bytes" "context" - "sync" "time" "github.com/rcrowley/go-metrics" @@ -27,14 +26,14 @@ const ( type connectionManager struct { in map[uint32]struct{} - inLock *sync.RWMutex + inLock syncRWMutex out map[uint32]struct{} - outLock *sync.RWMutex + outLock syncRWMutex // relayUsed holds which relay localIndexs are in use relayUsed map[uint32]struct{} - relayUsedLock *sync.RWMutex + relayUsedLock syncRWMutex hostMap *HostMap trafficTimer *LockingTimerWheel[uint32] @@ -59,11 +58,11 @@ func newConnectionManager(ctx context.Context, l *logrus.Logger, intf *Interface nc := &connectionManager{ hostMap: intf.hostMap, in: make(map[uint32]struct{}), - inLock: &sync.RWMutex{}, + inLock: newSyncRWMutex(mutexKey{Type: mutexKeyTypeConnectionManagerIn}), out: make(map[uint32]struct{}), - outLock: &sync.RWMutex{}, + outLock: newSyncRWMutex(mutexKey{Type: mutexKeyTypeConnectionManagerOut}), relayUsed: make(map[uint32]struct{}), - relayUsedLock: &sync.RWMutex{}, + relayUsedLock: newSyncRWMutex(mutexKey{Type: mutexKeyTypeConnectionManagerRelayUsed}), trafficTimer: NewLockingTimerWheel[uint32](time.Millisecond*500, max), intf: intf, pendingDeletion: make(map[uint32]struct{}), diff --git a/connection_state.go b/connection_state.go index 8ef8b3a24..31d21028e 100644 --- a/connection_state.go +++ b/connection_state.go @@ -3,7 +3,6 @@ package nebula import ( "crypto/rand" "encoding/json" - "sync" "sync/atomic" "github.com/flynn/noise" @@ -23,7 +22,7 @@ type ConnectionState struct { initiator bool messageCounter atomic.Uint64 window *Bits - writeLock sync.Mutex + writeLock syncMutex } func NewConnectionState(l *logrus.Logger, cipher string, certState *CertState, initiator bool, pattern noise.HandshakePattern, psk []byte, pskStage int) *ConnectionState { @@ -71,6 +70,7 @@ func NewConnectionState(l *logrus.Logger, cipher string, certState *CertState, i initiator: initiator, window: b, myCert: certState.Certificate, + writeLock: newSyncMutex(mutexKey{Type: mutexKeyTypeConnectionStateWrite}), } return ci diff --git a/firewall.go b/firewall.go index 64fada36d..513aaf50a 100644 --- a/firewall.go +++ b/firewall.go @@ -11,7 +11,6 @@ import ( "reflect" "strconv" "strings" - "sync" "time" "github.com/rcrowley/go-metrics" @@ -78,7 +77,7 @@ type firewallMetrics struct { } type FirewallConntrack struct { - sync.Mutex + syncMutex Conns map[firewall.Packet]*conn TimerWheel *TimerWheel[firewall.Packet] @@ -149,6 +148,7 @@ func NewFirewall(l *logrus.Logger, tcpTimeout, UDPTimeout, defaultTimeout time.D return &Firewall{ Conntrack: &FirewallConntrack{ + syncMutex: newSyncMutex(mutexKey{Type: mutexKeyTypeFirewallConntrack}), Conns: make(map[firewall.Packet]*conn), TimerWheel: NewTimerWheel[firewall.Packet](min, max), }, diff --git a/go.mod b/go.mod index f84a9fcfc..da13f10a4 100644 --- a/go.mod +++ b/go.mod @@ -39,8 +39,11 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emirpasic/gods v1.18.1 // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/btree v1.0.1 // indirect + github.com/google/uuid v1.3.0 // indirect + github.com/heimdalr/dag v1.4.0 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect diff --git a/go.sum b/go.sum index 32c9d128c..876e35462 100644 --- a/go.sum +++ b/go.sum @@ -22,6 +22,8 @@ github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432/go. github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= +github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/flynn/noise v1.0.0 h1:DlTHqmzmvcEiKj+4RYo/imoswx/4r6iBlCMfVtrMXpQ= github.com/flynn/noise v1.0.0/go.mod h1:xbMo+0i6+IGbYdJhF31t2eR1BIU0CYc12+BNAKwUTag= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= @@ -31,6 +33,7 @@ github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9 github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= @@ -58,6 +61,10 @@ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gopacket v1.1.19 h1:ves8RnFZPGiFnTS0uPQStjwru6uO6h+nlr9j6fL7kF8= github.com/google/gopacket v1.1.19/go.mod h1:iJ8V8n6KS+z2U1A8pUwu8bW5SyEMkXJB8Yo/Vo+TKTo= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/heimdalr/dag v1.4.0 h1:zG3JA4RDVLc55k3AXAgfwa+EgBNZ0TkfOO3C29Ucpmg= +github.com/heimdalr/dag v1.4.0/go.mod h1:OCh6ghKmU0hPjtwMqWBoNxPmtRioKd1xSu7Zs4sbIqM= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= diff --git a/handshake_manager.go b/handshake_manager.go index e49b9ba80..62204b31e 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -7,7 +7,6 @@ import ( "encoding/binary" "errors" "net" - "sync" "time" "github.com/rcrowley/go-metrics" @@ -65,7 +64,7 @@ type HandshakeManager struct { } type HandshakeHostInfo struct { - sync.Mutex + syncMutex startTime time.Time // Time that we first started trying with this handshake ready bool // Is the handshake ready @@ -397,6 +396,7 @@ func (hm *HandshakeManager) StartHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han } hh := &HandshakeHostInfo{ + syncMutex: newSyncMutex(mutexKey{Type: mutexKeyTypeHandshakeHostInfo, ID: uint32(vpnIp)}), hostinfo: hostinfo, startTime: time.Now(), } diff --git a/lighthouse.go b/lighthouse.go index 2193ad3ce..7fc573aad 100644 --- a/lighthouse.go +++ b/lighthouse.go @@ -7,7 +7,6 @@ import ( "fmt" "net" "net/netip" - "sync" "sync/atomic" "time" @@ -33,7 +32,7 @@ type netIpAndPort struct { type LightHouse struct { //TODO: We need a timer wheel to kick out vpnIps that haven't reported in a long time - sync.RWMutex //Because we concurrently read and write to our maps + syncRWMutex //Because we concurrently read and write to our maps ctx context.Context amLighthouse bool myVpnIp iputil.VpnIp @@ -101,6 +100,7 @@ func NewLightHouseFromConfig(ctx context.Context, l *logrus.Logger, c *config.C, ones, _ := myVpnNet.Mask.Size() h := LightHouse{ + syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeLightHouse}), ctx: ctx, amLighthouse: amLighthouse, myVpnIp: iputil.Ip2VpnIp(myVpnNet.IP), diff --git a/mutex.go b/mutex.go index 1bb3c20a6..bcf6d8884 100644 --- a/mutex.go +++ b/mutex.go @@ -1,27 +1,59 @@ -//go:build !mutex_debug -// +build !mutex_debug - package nebula -import ( - "sync" -) - -type syncRWMutex = sync.RWMutex +import "fmt" type mutexKeyType string const ( - mutexKeyTypeHostMap mutexKeyType = "hostmap" - mutexKeyTypeHostInfo = "hostinfo" - mutexKeyTypeHandshakeManager = "handshake-manager" + mutexKeyTypeHostMap mutexKeyType = "hostmap" + + mutexKeyTypeLightHouse = "lighthouse" + mutexKeyTypeFirewallConntrack = "firewall-conntrack" + mutexKeyTypeHostInfo = "hostinfo" + mutexKeyTypeHandshakeHostInfo = "handshake-hostinfo" + mutexKeyTypeHandshakeManager = "handshake-manager" + mutexKeyTypeConnectionStateWrite = "connection-state-write-lock" + + mutexKeyTypeConnectionManagerIn = "connection-manager-in-lock" + mutexKeyTypeConnectionManagerOut = "connection-manager-out-lock" + mutexKeyTypeConnectionManagerRelayUsed = "connection-manager-relay-used-lock" ) -func newSyncRWMutex(mutexKey) syncRWMutex { - return sync.RWMutex{} +// For each Key in this map, the Value is a list of lock types you can already have +// when you want to grab that Key. This ensures that locks are always fetched +// in the same order, to prevent deadlocks. +var allowedConcurrentLocks = map[mutexKeyType][]mutexKeyType{ + mutexKeyTypeHostMap: {mutexKeyTypeHandshakeHostInfo}, + mutexKeyTypeFirewallConntrack: {mutexKeyTypeHandshakeHostInfo}, + + mutexKeyTypeHandshakeManager: {mutexKeyTypeHostMap}, + mutexKeyTypeConnectionStateWrite: {mutexKeyTypeHostMap}, + + mutexKeyTypeLightHouse: {mutexKeyTypeHandshakeManager}, + + mutexKeyTypeConnectionManagerIn: {mutexKeyTypeHostMap}, + mutexKeyTypeConnectionManagerOut: {mutexKeyTypeConnectionStateWrite, mutexKeyTypeConnectionManagerIn}, + mutexKeyTypeConnectionManagerRelayUsed: {mutexKeyTypeHandshakeHostInfo}, } type mutexKey struct { Type mutexKeyType ID uint32 } + +type mutexValue struct { + file string + line int +} + +func (m mutexKey) String() string { + if m.ID == 0 { + return fmt.Sprintf("%s", m.Type) + } else { + return fmt.Sprintf("%s(%d)", m.Type, m.ID) + } +} + +func (m mutexValue) String() string { + return fmt.Sprintf("%s:%d", m.file, m.line) +} diff --git a/mutex_debug.go b/mutex_debug.go index 605964b8f..ce52590d5 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -8,34 +8,42 @@ import ( "runtime" "sync" + "github.com/heimdalr/dag" "github.com/timandy/routine" ) var threadLocal routine.ThreadLocal = routine.NewThreadLocalWithInitial(func() any { return map[mutexKey]mutexValue{} }) -type mutexKeyType string +var allowedDAG *dag.DAG -const ( - mutexKeyTypeHostMap mutexKeyType = "hostmap" - mutexKeyTypeHostInfo = "hostinfo" - mutexKeyTypeHandshakeManager = "handshake-manager" -) - -// For each Key in this map, the Value is a list of lock types you can already have -// when you want to grab that Key. This ensures that locks are always fetched -// in the same order, to prevent deadlocks. -var allowedConcurrentLocks = map[mutexKeyType][]mutexKeyType{ - mutexKeyTypeHandshakeManager: {mutexKeyTypeHostMap}, -} +func init() { + allowedDAG = dag.NewDAG() + for k, v := range allowedConcurrentLocks { + allowedDAG.AddVertexByID(string(k), k) + for _, t := range v { + if _, err := allowedDAG.GetVertex(string(t)); err != nil { + allowedDAG.AddVertexByID(string(t), t) + } + } + } + for k, v := range allowedConcurrentLocks { + for _, t := range v { + allowedDAG.AddEdge(string(t), string(k)) + } + } -type mutexKey struct { - Type mutexKeyType - ID uint32 -} + for k := range allowedConcurrentLocks { + anc, err := allowedDAG.GetAncestors(string(k)) + if err != nil { + panic(err) + } -type mutexValue struct { - file string - line int + var allowed []mutexKeyType + for t := range anc { + allowed = append(allowed, mutexKeyType(t)) + } + allowedConcurrentLocks[k] = allowed + } } type syncRWMutex struct { @@ -43,12 +51,23 @@ type syncRWMutex struct { mutexKey } +type syncMutex struct { + sync.Mutex + mutexKey +} + func newSyncRWMutex(key mutexKey) syncRWMutex { return syncRWMutex{ mutexKey: key, } } +func newSyncMutex(key mutexKey) syncMutex { + return syncMutex{ + mutexKey: key, + } +} + func alertMutex(err error) { panic(err) // NOTE: you could switch to this log Line and remove the panic if you want @@ -108,14 +127,17 @@ func (s *syncRWMutex) RUnlock() { s.RWMutex.RUnlock() } -func (m mutexKey) String() string { - if m.ID == 0 { - return fmt.Sprintf("%s", m.Type) - } else { - return fmt.Sprintf("%s(%d)", m.Type, m.ID) - } +func (s *syncMutex) Lock() { + m := threadLocal.Get().(map[mutexKey]mutexValue) + checkMutex(m, s.mutexKey) + v := mutexValue{} + _, v.file, v.line, _ = runtime.Caller(1) + m[s.mutexKey] = v + s.Mutex.Lock() } -func (m mutexValue) String() string { - return fmt.Sprintf("%s:%d", m.file, m.line) +func (s *syncMutex) Unlock() { + m := threadLocal.Get().(map[mutexKey]mutexValue) + delete(m, s.mutexKey) + s.Mutex.Unlock() } diff --git a/mutex_nodebug.go b/mutex_nodebug.go new file mode 100644 index 000000000..823c4a7cd --- /dev/null +++ b/mutex_nodebug.go @@ -0,0 +1,19 @@ +//go:build !mutex_debug +// +build !mutex_debug + +package nebula + +import ( + "sync" +) + +type syncRWMutex = sync.RWMutex +type syncMutex = sync.Mutex + +func newSyncRWMutex(mutexKey) syncRWMutex { + return sync.RWMutex{} +} + +func newSyncMutex(mutexKey) syncMutex { + return sync.Mutex{} +} From bcaefce4ac2fc3124384a8bb8cb72445fff8446a Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Mon, 18 Dec 2023 22:38:52 -0500 Subject: [PATCH 10/18] more types --- handshake_ix.go | 1 + handshake_manager.go | 1 + hostmap.go | 3 +-- mutex.go | 5 +++++ remote_list.go | 12 ++++++------ 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/handshake_ix.go b/handshake_ix.go index f4a3106cc..1c7ff6c90 100644 --- a/handshake_ix.go +++ b/handshake_ix.go @@ -135,6 +135,7 @@ func ixHandshakeStage1(f *Interface, addr *udp.Addr, via *ViaSender, packet []by HandshakePacket: make(map[uint8][]byte, 0), lastHandshakeTime: hs.Details.Time, relayState: RelayState{ + syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeRelayState, ID: uint32(vpnIp)}), relays: map[iputil.VpnIp]struct{}{}, relayForByIp: map[iputil.VpnIp]*Relay{}, relayForByIdx: map[uint32]*Relay{}, diff --git a/handshake_manager.go b/handshake_manager.go index 62204b31e..db7e2a564 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -389,6 +389,7 @@ func (hm *HandshakeManager) StartHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han vpnIp: vpnIp, HandshakePacket: make(map[uint8][]byte, 0), relayState: RelayState{ + syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeRelayState, ID: uint32(vpnIp)}), relays: map[iputil.VpnIp]struct{}{}, relayForByIp: map[iputil.VpnIp]*Relay{}, relayForByIdx: map[uint32]*Relay{}, diff --git a/hostmap.go b/hostmap.go index 1834705de..c5cbc5d28 100644 --- a/hostmap.go +++ b/hostmap.go @@ -3,7 +3,6 @@ package nebula import ( "errors" "net" - "sync" "sync/atomic" "time" @@ -67,7 +66,7 @@ type HostMap struct { // struct, make a copy of an existing value, edit the fileds in the copy, and // then store a pointer to the new copy in both realyForBy* maps. type RelayState struct { - sync.RWMutex + syncRWMutex relays map[iputil.VpnIp]struct{} // Set of VpnIp's of Hosts to use as relays to access this peer relayForByIp map[iputil.VpnIp]*Relay // Maps VpnIps of peers for which this HostInfo is a relay to some Relay info diff --git a/mutex.go b/mutex.go index bcf6d8884..21a47ae3b 100644 --- a/mutex.go +++ b/mutex.go @@ -8,8 +8,10 @@ const ( mutexKeyTypeHostMap mutexKeyType = "hostmap" mutexKeyTypeLightHouse = "lighthouse" + mutexKeyTypeRemoteList = "remote-list" mutexKeyTypeFirewallConntrack = "firewall-conntrack" mutexKeyTypeHostInfo = "hostinfo" + mutexKeyTypeRelayState = "relay-state" mutexKeyTypeHandshakeHostInfo = "handshake-hostinfo" mutexKeyTypeHandshakeManager = "handshake-manager" mutexKeyTypeConnectionStateWrite = "connection-state-write-lock" @@ -30,10 +32,13 @@ var allowedConcurrentLocks = map[mutexKeyType][]mutexKeyType{ mutexKeyTypeConnectionStateWrite: {mutexKeyTypeHostMap}, mutexKeyTypeLightHouse: {mutexKeyTypeHandshakeManager}, + mutexKeyTypeRemoteList: {mutexKeyTypeLightHouse}, mutexKeyTypeConnectionManagerIn: {mutexKeyTypeHostMap}, mutexKeyTypeConnectionManagerOut: {mutexKeyTypeConnectionStateWrite, mutexKeyTypeConnectionManagerIn}, mutexKeyTypeConnectionManagerRelayUsed: {mutexKeyTypeHandshakeHostInfo}, + + mutexKeyTypeRelayState: {mutexKeyTypeHostMap, mutexKeyTypeConnectionManagerRelayUsed}, } type mutexKey struct { diff --git a/remote_list.go b/remote_list.go index 60a1afdaf..a573e6531 100644 --- a/remote_list.go +++ b/remote_list.go @@ -7,7 +7,6 @@ import ( "net/netip" "sort" "strconv" - "sync" "sync/atomic" "time" @@ -190,7 +189,7 @@ func (hr *hostnamesResults) GetIPs() []netip.AddrPort { // It serves as a local cache of query replies, host update notifications, and locally learned addresses type RemoteList struct { // Every interaction with internals requires a lock! - sync.RWMutex + syncRWMutex // A deduplicated set of addresses. Any accessor should lock beforehand. addrs []*udp.Addr @@ -217,10 +216,11 @@ type RemoteList struct { // NewRemoteList creates a new empty RemoteList func NewRemoteList(shouldAdd func(netip.Addr) bool) *RemoteList { return &RemoteList{ - addrs: make([]*udp.Addr, 0), - relays: make([]*iputil.VpnIp, 0), - cache: make(map[iputil.VpnIp]*cache), - shouldAdd: shouldAdd, + syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeRemoteList}), + addrs: make([]*udp.Addr, 0), + relays: make([]*iputil.VpnIp, 0), + cache: make(map[iputil.VpnIp]*cache), + shouldAdd: shouldAdd, } } From 6f27f4696536679de34285c75aa7efef1e90c1d8 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Tue, 19 Dec 2023 09:10:00 -0500 Subject: [PATCH 11/18] simplify --- connection_manager.go | 6 ++-- connection_state.go | 2 +- firewall.go | 2 +- go.mod | 2 +- go.sum | 1 + handshake_ix.go | 4 +-- handshake_manager.go | 8 +++--- hostmap.go | 2 +- lighthouse.go | 2 +- mutex.go | 64 ------------------------------------------- mutex_debug.go | 53 +++++++++++++++++++++++++++-------- mutex_nodebug.go | 1 + remote_list.go | 2 +- 13 files changed, 59 insertions(+), 90 deletions(-) delete mode 100644 mutex.go diff --git a/connection_manager.go b/connection_manager.go index 924d19860..2bc94455f 100644 --- a/connection_manager.go +++ b/connection_manager.go @@ -58,11 +58,11 @@ func newConnectionManager(ctx context.Context, l *logrus.Logger, intf *Interface nc := &connectionManager{ hostMap: intf.hostMap, in: make(map[uint32]struct{}), - inLock: newSyncRWMutex(mutexKey{Type: mutexKeyTypeConnectionManagerIn}), + inLock: newSyncRWMutex("connection-manager-in"), out: make(map[uint32]struct{}), - outLock: newSyncRWMutex(mutexKey{Type: mutexKeyTypeConnectionManagerOut}), + outLock: newSyncRWMutex("connection-manager-out"), relayUsed: make(map[uint32]struct{}), - relayUsedLock: newSyncRWMutex(mutexKey{Type: mutexKeyTypeConnectionManagerRelayUsed}), + relayUsedLock: newSyncRWMutex("connection-manager-relay-used"), trafficTimer: NewLockingTimerWheel[uint32](time.Millisecond*500, max), intf: intf, pendingDeletion: make(map[uint32]struct{}), diff --git a/connection_state.go b/connection_state.go index 31d21028e..5373f967c 100644 --- a/connection_state.go +++ b/connection_state.go @@ -70,7 +70,7 @@ func NewConnectionState(l *logrus.Logger, cipher string, certState *CertState, i initiator: initiator, window: b, myCert: certState.Certificate, - writeLock: newSyncMutex(mutexKey{Type: mutexKeyTypeConnectionStateWrite}), + writeLock: newSyncMutex("connection-state-write"), } return ci diff --git a/firewall.go b/firewall.go index 513aaf50a..53d7b68d0 100644 --- a/firewall.go +++ b/firewall.go @@ -148,7 +148,7 @@ func NewFirewall(l *logrus.Logger, tcpTimeout, UDPTimeout, defaultTimeout time.D return &Firewall{ Conntrack: &FirewallConntrack{ - syncMutex: newSyncMutex(mutexKey{Type: mutexKeyTypeFirewallConntrack}), + syncMutex: newSyncMutex("firewall-conntrack"), Conns: make(map[firewall.Packet]*conn), TimerWheel: NewTimerWheel[firewall.Packet](min, max), }, diff --git a/go.mod b/go.mod index da13f10a4..d502622dc 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/flynn/noise v1.0.0 github.com/gogo/protobuf v1.3.2 github.com/google/gopacket v1.1.19 + github.com/heimdalr/dag v1.4.0 github.com/kardianos/service v1.2.2 github.com/miekg/dns v1.1.56 github.com/nbrownus/go-metrics-prometheus v0.0.0-20210712211119-974a6260965f @@ -43,7 +44,6 @@ require ( github.com/golang/protobuf v1.5.3 // indirect github.com/google/btree v1.0.1 // indirect github.com/google/uuid v1.3.0 // indirect - github.com/heimdalr/dag v1.4.0 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect diff --git a/go.sum b/go.sum index 876e35462..4421a3d5b 100644 --- a/go.sum +++ b/go.sum @@ -33,6 +33,7 @@ github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9 github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= diff --git a/handshake_ix.go b/handshake_ix.go index 1c7ff6c90..1f930407f 100644 --- a/handshake_ix.go +++ b/handshake_ix.go @@ -127,7 +127,7 @@ func ixHandshakeStage1(f *Interface, addr *udp.Addr, via *ViaSender, packet []by } hostinfo := &HostInfo{ - syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeHostInfo, ID: uint32(vpnIp)}), + syncRWMutex: newSyncRWMutex("hostinfo"), ConnectionState: ci, localIndexId: myIndex, remoteIndexId: hs.Details.InitiatorIndex, @@ -135,7 +135,7 @@ func ixHandshakeStage1(f *Interface, addr *udp.Addr, via *ViaSender, packet []by HandshakePacket: make(map[uint8][]byte, 0), lastHandshakeTime: hs.Details.Time, relayState: RelayState{ - syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeRelayState, ID: uint32(vpnIp)}), + syncRWMutex: newSyncRWMutex("relay-state"), relays: map[iputil.VpnIp]struct{}{}, relayForByIp: map[iputil.VpnIp]*Relay{}, relayForByIdx: map[uint32]*Relay{}, diff --git a/handshake_manager.go b/handshake_manager.go index db7e2a564..5834938ea 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -102,7 +102,7 @@ func (hh *HandshakeHostInfo) cachePacket(l *logrus.Logger, t header.MessageType, func NewHandshakeManager(l *logrus.Logger, mainHostMap *HostMap, lightHouse *LightHouse, outside udp.Conn, config HandshakeConfig) *HandshakeManager { return &HandshakeManager{ - syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeHandshakeManager}), + syncRWMutex: newSyncRWMutex("handshake-manager"), vpnIps: map[iputil.VpnIp]*HandshakeHostInfo{}, indexes: map[uint32]*HandshakeHostInfo{}, mainHostMap: mainHostMap, @@ -385,11 +385,11 @@ func (hm *HandshakeManager) StartHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han } hostinfo := &HostInfo{ - syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeHostInfo, ID: uint32(vpnIp)}), + syncRWMutex: newSyncRWMutex("hostinfo"), vpnIp: vpnIp, HandshakePacket: make(map[uint8][]byte, 0), relayState: RelayState{ - syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeRelayState, ID: uint32(vpnIp)}), + syncRWMutex: newSyncRWMutex("relay-state"), relays: map[iputil.VpnIp]struct{}{}, relayForByIp: map[iputil.VpnIp]*Relay{}, relayForByIdx: map[uint32]*Relay{}, @@ -397,7 +397,7 @@ func (hm *HandshakeManager) StartHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han } hh := &HandshakeHostInfo{ - syncMutex: newSyncMutex(mutexKey{Type: mutexKeyTypeHandshakeHostInfo, ID: uint32(vpnIp)}), + syncMutex: newSyncMutex("handshake-hostinfo"), hostinfo: hostinfo, startTime: time.Now(), } diff --git a/hostmap.go b/hostmap.go index c5cbc5d28..3cf316e3a 100644 --- a/hostmap.go +++ b/hostmap.go @@ -260,7 +260,7 @@ func NewHostMap(l *logrus.Logger, vpnCIDR *net.IPNet, preferredRanges []*net.IPN r := map[uint32]*HostInfo{} relays := map[uint32]*HostInfo{} m := HostMap{ - syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeHostMap}), + syncRWMutex: newSyncRWMutex("hostmap"), Indexes: i, Relays: relays, RemoteIndexes: r, diff --git a/lighthouse.go b/lighthouse.go index 7fc573aad..29970e708 100644 --- a/lighthouse.go +++ b/lighthouse.go @@ -100,7 +100,7 @@ func NewLightHouseFromConfig(ctx context.Context, l *logrus.Logger, c *config.C, ones, _ := myVpnNet.Mask.Size() h := LightHouse{ - syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeLightHouse}), + syncRWMutex: newSyncRWMutex("lighthouse"), ctx: ctx, amLighthouse: amLighthouse, myVpnIp: iputil.Ip2VpnIp(myVpnNet.IP), diff --git a/mutex.go b/mutex.go deleted file mode 100644 index 21a47ae3b..000000000 --- a/mutex.go +++ /dev/null @@ -1,64 +0,0 @@ -package nebula - -import "fmt" - -type mutexKeyType string - -const ( - mutexKeyTypeHostMap mutexKeyType = "hostmap" - - mutexKeyTypeLightHouse = "lighthouse" - mutexKeyTypeRemoteList = "remote-list" - mutexKeyTypeFirewallConntrack = "firewall-conntrack" - mutexKeyTypeHostInfo = "hostinfo" - mutexKeyTypeRelayState = "relay-state" - mutexKeyTypeHandshakeHostInfo = "handshake-hostinfo" - mutexKeyTypeHandshakeManager = "handshake-manager" - mutexKeyTypeConnectionStateWrite = "connection-state-write-lock" - - mutexKeyTypeConnectionManagerIn = "connection-manager-in-lock" - mutexKeyTypeConnectionManagerOut = "connection-manager-out-lock" - mutexKeyTypeConnectionManagerRelayUsed = "connection-manager-relay-used-lock" -) - -// For each Key in this map, the Value is a list of lock types you can already have -// when you want to grab that Key. This ensures that locks are always fetched -// in the same order, to prevent deadlocks. -var allowedConcurrentLocks = map[mutexKeyType][]mutexKeyType{ - mutexKeyTypeHostMap: {mutexKeyTypeHandshakeHostInfo}, - mutexKeyTypeFirewallConntrack: {mutexKeyTypeHandshakeHostInfo}, - - mutexKeyTypeHandshakeManager: {mutexKeyTypeHostMap}, - mutexKeyTypeConnectionStateWrite: {mutexKeyTypeHostMap}, - - mutexKeyTypeLightHouse: {mutexKeyTypeHandshakeManager}, - mutexKeyTypeRemoteList: {mutexKeyTypeLightHouse}, - - mutexKeyTypeConnectionManagerIn: {mutexKeyTypeHostMap}, - mutexKeyTypeConnectionManagerOut: {mutexKeyTypeConnectionStateWrite, mutexKeyTypeConnectionManagerIn}, - mutexKeyTypeConnectionManagerRelayUsed: {mutexKeyTypeHandshakeHostInfo}, - - mutexKeyTypeRelayState: {mutexKeyTypeHostMap, mutexKeyTypeConnectionManagerRelayUsed}, -} - -type mutexKey struct { - Type mutexKeyType - ID uint32 -} - -type mutexValue struct { - file string - line int -} - -func (m mutexKey) String() string { - if m.ID == 0 { - return fmt.Sprintf("%s", m.Type) - } else { - return fmt.Sprintf("%s(%d)", m.Type, m.ID) - } -} - -func (m mutexValue) String() string { - return fmt.Sprintf("%s:%d", m.file, m.line) -} diff --git a/mutex_debug.go b/mutex_debug.go index ce52590d5..b015c5dcd 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -12,35 +12,66 @@ import ( "github.com/timandy/routine" ) +type mutexKey = string + +// For each Key in this map, the Value is a list of lock types you can already have +// when you want to grab that Key. This ensures that locks are always fetched +// in the same order, to prevent deadlocks. +var allowedConcurrentLocks = map[mutexKey][]mutexKey{ + "connection-manager-in": {"hostmap"}, + "connection-manager-out": {"connection-state-write", "connection-manager-in"}, + "connection-manager-relay-used": {"handshake-hostinfo"}, + "connection-state-write": {"hostmap"}, + "firewall-conntrack": {"handshake-hostinfo"}, + "handshake-manager": {"hostmap"}, + "hostmap": {"handshake-hostinfo"}, + "lighthouse": {"handshake-manager"}, + "relay-state": {"hostmap", "connection-manager-relay-used"}, + "remote-list": {"lighthouse"}, +} + +type mutexValue struct { + file string + line int +} + +func (m mutexValue) String() string { + return fmt.Sprintf("%s:%d", m.file, m.line) +} + var threadLocal routine.ThreadLocal = routine.NewThreadLocalWithInitial(func() any { return map[mutexKey]mutexValue{} }) var allowedDAG *dag.DAG +// We build a directed acyclic graph to assert that the locks can only be +// acquired in a determined order, If there are cycles in the DAG, then we +// know that the locking order is not guaranteed. func init() { allowedDAG = dag.NewDAG() for k, v := range allowedConcurrentLocks { - allowedDAG.AddVertexByID(string(k), k) + _ = allowedDAG.AddVertexByID(k, k) for _, t := range v { - if _, err := allowedDAG.GetVertex(string(t)); err != nil { - allowedDAG.AddVertexByID(string(t), t) - } + _ = allowedDAG.AddVertexByID(t, t) } } for k, v := range allowedConcurrentLocks { for _, t := range v { - allowedDAG.AddEdge(string(t), string(k)) + if err := allowedDAG.AddEdge(t, k); err != nil { + panic(fmt.Errorf("Failed to assembled DAG for allowedConcurrentLocks: %w", err)) + } } } + // Rebuild allowedConcurrentLocks as a flattened list of all possibilities for k := range allowedConcurrentLocks { - anc, err := allowedDAG.GetAncestors(string(k)) + anc, err := allowedDAG.GetAncestors(k) if err != nil { panic(err) } - var allowed []mutexKeyType + var allowed []mutexKey for t := range anc { - allowed = append(allowed, mutexKeyType(t)) + allowed = append(allowed, mutexKey(t)) } allowedConcurrentLocks[k] = allowed } @@ -76,7 +107,7 @@ func alertMutex(err error) { } func checkMutex(state map[mutexKey]mutexValue, add mutexKey) { - allowedConcurrent := allowedConcurrentLocks[add.Type] + allowedConcurrent := allowedConcurrentLocks[add] for k, v := range state { if add == k { @@ -86,13 +117,13 @@ func checkMutex(state map[mutexKey]mutexValue, add mutexKey) { // TODO use slices.Contains, but requires go1.21 var found bool for _, a := range allowedConcurrent { - if a == k.Type { + if a == k { found = true break } } if !found { - alertMutex(fmt.Errorf("grabbing %s lock and already have these locks: %s", add.Type, state)) + alertMutex(fmt.Errorf("grabbing %s lock and already have these locks: %s", add, state)) } } } diff --git a/mutex_nodebug.go b/mutex_nodebug.go index 823c4a7cd..76086ff54 100644 --- a/mutex_nodebug.go +++ b/mutex_nodebug.go @@ -7,6 +7,7 @@ import ( "sync" ) +type mutexKey = string type syncRWMutex = sync.RWMutex type syncMutex = sync.Mutex diff --git a/remote_list.go b/remote_list.go index a573e6531..b07d15cc1 100644 --- a/remote_list.go +++ b/remote_list.go @@ -216,7 +216,7 @@ type RemoteList struct { // NewRemoteList creates a new empty RemoteList func NewRemoteList(shouldAdd func(netip.Addr) bool) *RemoteList { return &RemoteList{ - syncRWMutex: newSyncRWMutex(mutexKey{Type: mutexKeyTypeRemoteList}), + syncRWMutex: newSyncRWMutex("remote-list"), addrs: make([]*udp.Addr, 0), relays: make([]*iputil.VpnIp, 0), cache: make(map[iputil.VpnIp]*cache), From 26f7a9fd45f67c8a8fccf28c4530d3c8fff2a376 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Tue, 19 Dec 2023 11:24:14 -0500 Subject: [PATCH 12/18] use terraform dag impl --- go.mod | 4 +--- go.sum | 10 ++-------- mutex_debug.go | 27 ++++++++++++++++----------- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/go.mod b/go.mod index d502622dc..32323f05e 100644 --- a/go.mod +++ b/go.mod @@ -6,11 +6,11 @@ require ( dario.cat/mergo v1.0.0 github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be github.com/armon/go-radix v1.0.0 + github.com/clarkmcc/go-dag v0.0.0-20220908000337-9c3ba5b365fc github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432 github.com/flynn/noise v1.0.0 github.com/gogo/protobuf v1.3.2 github.com/google/gopacket v1.1.19 - github.com/heimdalr/dag v1.4.0 github.com/kardianos/service v1.2.2 github.com/miekg/dns v1.1.56 github.com/nbrownus/go-metrics-prometheus v0.0.0-20210712211119-974a6260965f @@ -40,10 +40,8 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect - github.com/emirpasic/gods v1.18.1 // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/btree v1.0.1 // indirect - github.com/google/uuid v1.3.0 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect diff --git a/go.sum b/go.sum index 4421a3d5b..d05b15b39 100644 --- a/go.sum +++ b/go.sum @@ -17,13 +17,13 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/clarkmcc/go-dag v0.0.0-20220908000337-9c3ba5b365fc h1:6e91sWiDE69Jl0WUsY/LvTCBPRBe6b2j8H7W96JGJ4s= +github.com/clarkmcc/go-dag v0.0.0-20220908000337-9c3ba5b365fc/go.mod h1:RGIcF96ORCYAsdz60Ou9mPBNa4+DjoQFS8nelPniFoY= github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432 h1:M5QgkYacWj0Xs8MhpIK/5uwU02icXpEoSo9sM2aRCps= github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432/go.mod h1:xwIwAxMvYnVrGJPe2FKx5prTrnAjGOD8zvDOnxnrrkM= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= -github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/flynn/noise v1.0.0 h1:DlTHqmzmvcEiKj+4RYo/imoswx/4r6iBlCMfVtrMXpQ= github.com/flynn/noise v1.0.0/go.mod h1:xbMo+0i6+IGbYdJhF31t2eR1BIU0CYc12+BNAKwUTag= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= @@ -33,8 +33,6 @@ github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9 github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= -github.com/go-test/deep v1.1.0 h1:WOcxcdHcvdgThNXjw0t76K42FXTU7HpNQWHpA2HHNlg= -github.com/go-test/deep v1.1.0/go.mod h1:5C2ZWiW0ErCdrYzpqxLbTX7MG14M9iiw8DgHncVwcsE= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= @@ -62,10 +60,6 @@ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gopacket v1.1.19 h1:ves8RnFZPGiFnTS0uPQStjwru6uO6h+nlr9j6fL7kF8= github.com/google/gopacket v1.1.19/go.mod h1:iJ8V8n6KS+z2U1A8pUwu8bW5SyEMkXJB8Yo/Vo+TKTo= -github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= -github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/heimdalr/dag v1.4.0 h1:zG3JA4RDVLc55k3AXAgfwa+EgBNZ0TkfOO3C29Ucpmg= -github.com/heimdalr/dag v1.4.0/go.mod h1:OCh6ghKmU0hPjtwMqWBoNxPmtRioKd1xSu7Zs4sbIqM= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= diff --git a/mutex_debug.go b/mutex_debug.go index b015c5dcd..7ba966760 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -8,7 +8,7 @@ import ( "runtime" "sync" - "github.com/heimdalr/dag" + "github.com/clarkmcc/go-dag" "github.com/timandy/routine" ) @@ -41,37 +41,38 @@ func (m mutexValue) String() string { var threadLocal routine.ThreadLocal = routine.NewThreadLocalWithInitial(func() any { return map[mutexKey]mutexValue{} }) -var allowedDAG *dag.DAG +var allowedDAG dag.AcyclicGraph // We build a directed acyclic graph to assert that the locks can only be // acquired in a determined order, If there are cycles in the DAG, then we // know that the locking order is not guaranteed. func init() { - allowedDAG = dag.NewDAG() for k, v := range allowedConcurrentLocks { - _ = allowedDAG.AddVertexByID(k, k) + allowedDAG.Add(k) for _, t := range v { - _ = allowedDAG.AddVertexByID(t, t) + allowedDAG.Add(t) } } for k, v := range allowedConcurrentLocks { for _, t := range v { - if err := allowedDAG.AddEdge(t, k); err != nil { - panic(fmt.Errorf("Failed to assembled DAG for allowedConcurrentLocks: %w", err)) - } + allowedDAG.Connect(dag.BasicEdge(k, t)) } } + if cycles := allowedDAG.Cycles(); len(cycles) > 0 { + panic(fmt.Errorf("Cycles found in allowedConcurrentLocks: %v", cycles)) + } + // Rebuild allowedConcurrentLocks as a flattened list of all possibilities for k := range allowedConcurrentLocks { - anc, err := allowedDAG.GetAncestors(k) + ancestors, err := allowedDAG.Ancestors(k) if err != nil { panic(err) } var allowed []mutexKey - for t := range anc { - allowed = append(allowed, mutexKey(t)) + for t := range ancestors { + allowed = append(allowed, t.(mutexKey)) } allowedConcurrentLocks[k] = allowed } @@ -107,6 +108,10 @@ func alertMutex(err error) { } func checkMutex(state map[mutexKey]mutexValue, add mutexKey) { + if add == "" { + alertMutex(fmt.Errorf("mutex not initialized with mutexKey")) + } + allowedConcurrent := allowedConcurrentLocks[add] for k, v := range state { From 1be8dc43a75c08e1efaae72659deb3e7e57a6eb3 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Mon, 5 Feb 2024 11:13:20 -0500 Subject: [PATCH 13/18] more --- connection_manager.go | 2 +- dns_server.go | 8 ++++---- handshake_manager.go | 2 +- mutex_debug.go | 2 ++ timeout.go | 6 +++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/connection_manager.go b/connection_manager.go index 8b527645a..88ba86caa 100644 --- a/connection_manager.go +++ b/connection_manager.go @@ -64,7 +64,7 @@ func newConnectionManager(ctx context.Context, l *logrus.Logger, intf *Interface outLock: newSyncRWMutex("connection-manager-out"), relayUsed: make(map[uint32]struct{}), relayUsedLock: newSyncRWMutex("connection-manager-relay-used"), - trafficTimer: NewLockingTimerWheel[uint32](time.Millisecond*500, max), + trafficTimer: NewLockingTimerWheel[uint32]("connection-manager-timer", time.Millisecond*500, max), intf: intf, pendingDeletion: make(map[uint32]struct{}), checkInterval: checkInterval, diff --git a/dns_server.go b/dns_server.go index 3109b4cf7..6b1bd709e 100644 --- a/dns_server.go +++ b/dns_server.go @@ -5,7 +5,6 @@ import ( "net" "strconv" "strings" - "sync" "github.com/miekg/dns" "github.com/sirupsen/logrus" @@ -20,15 +19,16 @@ var dnsServer *dns.Server var dnsAddr string type dnsRecords struct { - sync.RWMutex + syncRWMutex dnsMap map[string]string hostMap *HostMap } func newDnsRecords(hostMap *HostMap) *dnsRecords { return &dnsRecords{ - dnsMap: make(map[string]string), - hostMap: hostMap, + syncRWMutex: newSyncRWMutex("dns-records"), + dnsMap: make(map[string]string), + hostMap: hostMap, } } diff --git a/handshake_manager.go b/handshake_manager.go index 2e6bcbd12..5d76ec929 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -110,7 +110,7 @@ func NewHandshakeManager(l *logrus.Logger, mainHostMap *HostMap, lightHouse *Lig outside: outside, config: config, trigger: make(chan iputil.VpnIp, config.triggerBuffer), - OutboundHandshakeTimer: NewLockingTimerWheel[iputil.VpnIp](config.tryInterval, hsTimeout(config.retries, config.tryInterval)), + OutboundHandshakeTimer: NewLockingTimerWheel[iputil.VpnIp]("handshake-manager-timer", config.tryInterval, hsTimeout(config.retries, config.tryInterval)), messageMetrics: config.messageMetrics, metricInitiated: metrics.GetOrRegisterCounter("handshake_manager.initiated", nil), metricTimedOut: metrics.GetOrRegisterCounter("handshake_manager.timed_out", nil), diff --git a/mutex_debug.go b/mutex_debug.go index 7ba966760..2fc6eb793 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -21,9 +21,11 @@ var allowedConcurrentLocks = map[mutexKey][]mutexKey{ "connection-manager-in": {"hostmap"}, "connection-manager-out": {"connection-state-write", "connection-manager-in"}, "connection-manager-relay-used": {"handshake-hostinfo"}, + "connection-manager-timer": {"connection-manager-out"}, "connection-state-write": {"hostmap"}, "firewall-conntrack": {"handshake-hostinfo"}, "handshake-manager": {"hostmap"}, + "handshake-manager-timer": {"handshake-manager"}, "hostmap": {"handshake-hostinfo"}, "lighthouse": {"handshake-manager"}, "relay-state": {"hostmap", "connection-manager-relay-used"}, diff --git a/timeout.go b/timeout.go index c1b4c398b..705e58f56 100644 --- a/timeout.go +++ b/timeout.go @@ -1,7 +1,6 @@ package nebula import ( - "sync" "time" ) @@ -34,7 +33,7 @@ type TimerWheel[T any] struct { } type LockingTimerWheel[T any] struct { - m sync.Mutex + m syncMutex t *TimerWheel[T] } @@ -81,8 +80,9 @@ func NewTimerWheel[T any](min, max time.Duration) *TimerWheel[T] { } // NewLockingTimerWheel is version of TimerWheel that is safe for concurrent use with a small performance penalty -func NewLockingTimerWheel[T any](min, max time.Duration) *LockingTimerWheel[T] { +func NewLockingTimerWheel[T any](name string, min, max time.Duration) *LockingTimerWheel[T] { return &LockingTimerWheel[T]{ + m: newSyncMutex(name), t: NewTimerWheel[T](min, max), } } From c7f1bed882659d6bd5374eca883f5be6c35d2ac8 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Thu, 11 Apr 2024 12:58:25 -0400 Subject: [PATCH 14/18] avoid deadlock in lighthouse queryWorker If the lighthouse queryWorker tries to grab to call StartHandshake on a lighthouse vpnIp, we can deadlock on the handshake_manager lock. This change drops the handshake_manager lock before we send on the lighthouse queryChan (which could block), and also avoids sending to the channel if this is a lighthouse IP itself. --- handshake_manager.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/handshake_manager.go b/handshake_manager.go index b14b0fd37..241a16898 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -374,9 +374,9 @@ func (hm *HandshakeManager) GetOrHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han // StartHandshake will ensure a handshake is currently being attempted for the provided vpn ip func (hm *HandshakeManager) StartHandshake(vpnIp iputil.VpnIp, cacheCb func(*HandshakeHostInfo)) *HostInfo { hm.Lock() - defer hm.Unlock() if hh, ok := hm.vpnIps[vpnIp]; ok { + hm.Unlock() // We are already trying to handshake with this vpn ip if cacheCb != nil { cacheCb(hh) @@ -421,7 +421,10 @@ func (hm *HandshakeManager) StartHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han } } - hm.lightHouse.QueryServer(vpnIp) + hm.Unlock() + if !hm.lightHouse.IsLighthouseIP(vpnIp) { + hm.lightHouse.QueryServer(vpnIp) + } return hostinfo } From 2ff26b261db4f55c34812ce8c7c17b5392bf78e6 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Thu, 11 Apr 2024 13:02:13 -0400 Subject: [PATCH 15/18] need to hold lock during cacheCb --- handshake_manager.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/handshake_manager.go b/handshake_manager.go index 241a16898..640227a7e 100644 --- a/handshake_manager.go +++ b/handshake_manager.go @@ -376,11 +376,11 @@ func (hm *HandshakeManager) StartHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han hm.Lock() if hh, ok := hm.vpnIps[vpnIp]; ok { - hm.Unlock() // We are already trying to handshake with this vpn ip if cacheCb != nil { cacheCb(hh) } + hm.Unlock() return hh.hostinfo } @@ -422,9 +422,7 @@ func (hm *HandshakeManager) StartHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han } hm.Unlock() - if !hm.lightHouse.IsLighthouseIP(vpnIp) { - hm.lightHouse.QueryServer(vpnIp) - } + hm.lightHouse.QueryServer(vpnIp) return hostinfo } From f2251645bbbfdc1738d75314e38d08374aa8790e Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Thu, 11 Apr 2024 12:57:27 -0400 Subject: [PATCH 16/18] chanDebug --- Makefile | 1 + lighthouse.go | 3 +++ mutex_debug.go | 13 +++++++++++++ mutex_nodebug.go | 3 +++ 4 files changed, 20 insertions(+) diff --git a/Makefile b/Makefile index b8fc2827b..ba1d2f0ed 100644 --- a/Makefile +++ b/Makefile @@ -206,6 +206,7 @@ ifeq ($(words $(MAKECMDGOALS)),1) @$(MAKE) service ${.DEFAULT_GOAL} --no-print-directory endif +bin-docker: BUILD_ARGS = -tags=mutex_debug bin-docker: bin build/linux-amd64/nebula build/linux-amd64/nebula-cert smoke-docker: bin-docker diff --git a/lighthouse.go b/lighthouse.go index daf122c63..bdabbd718 100644 --- a/lighthouse.go +++ b/lighthouse.go @@ -468,6 +468,7 @@ func (lh *LightHouse) QueryServer(ip iputil.VpnIp) { return } + chanDebugSend("lighthouse-query-chan") lh.queryChan <- ip } @@ -750,6 +751,8 @@ func (lh *LightHouse) startQueryWorker() { nb := make([]byte, 12, 12) out := make([]byte, mtu) + chanDebugRecv("lighthouse-query-chan") + for { select { case <-lh.ctx.Done(): diff --git a/mutex_debug.go b/mutex_debug.go index 2fc6eb793..ee065eab2 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -135,6 +135,19 @@ func checkMutex(state map[mutexKey]mutexValue, add mutexKey) { } } +func chanDebugRecv(key mutexKey) { + m := threadLocal.Get().(map[mutexKey]mutexValue) + checkMutex(m, key) + v := mutexValue{} + _, v.file, v.line, _ = runtime.Caller(1) + m[key] = v +} + +func chanDebugSend(key mutexKey) { + m := threadLocal.Get().(map[mutexKey]mutexValue) + checkMutex(m, key) +} + func (s *syncRWMutex) Lock() { m := threadLocal.Get().(map[mutexKey]mutexValue) checkMutex(m, s.mutexKey) diff --git a/mutex_nodebug.go b/mutex_nodebug.go index 76086ff54..87e0c5a95 100644 --- a/mutex_nodebug.go +++ b/mutex_nodebug.go @@ -18,3 +18,6 @@ func newSyncRWMutex(mutexKey) syncRWMutex { func newSyncMutex(mutexKey) syncMutex { return sync.Mutex{} } + +func chanDebugRecv(key mutexKey) {} +func chanDebugSend(key mutexKey) {} From 1704d7f75aac30fcb875221fe1c1501c57aa847e Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Tue, 28 May 2024 13:22:47 -0400 Subject: [PATCH 17/18] allow more locks --- mutex_debug.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mutex_debug.go b/mutex_debug.go index ee065eab2..ce5050a57 100644 --- a/mutex_debug.go +++ b/mutex_debug.go @@ -26,10 +26,11 @@ var allowedConcurrentLocks = map[mutexKey][]mutexKey{ "firewall-conntrack": {"handshake-hostinfo"}, "handshake-manager": {"hostmap"}, "handshake-manager-timer": {"handshake-manager"}, - "hostmap": {"handshake-hostinfo"}, + "hostmap": {"handshake-hostinfo", "lighthouse-query-chan"}, "lighthouse": {"handshake-manager"}, "relay-state": {"hostmap", "connection-manager-relay-used"}, "remote-list": {"lighthouse"}, + "lighthouse-query-chan": {"handshake-hostinfo"}, } type mutexValue struct { From 77eced39ddcc7941df83b7e15ae4a89bbef27035 Mon Sep 17 00:00:00 2001 From: Wade Simmons Date: Tue, 28 May 2024 13:28:38 -0400 Subject: [PATCH 18/18] run smoke test with mutex_debug --- .github/workflows/smoke.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml index 54833bdc7..f7a73d314 100644 --- a/.github/workflows/smoke.yml +++ b/.github/workflows/smoke.yml @@ -26,7 +26,7 @@ jobs: check-latest: true - name: build - run: make bin-docker CGO_ENABLED=1 BUILD_ARGS=-race + run: make bin-docker CGO_ENABLED=1 BUILD_ARGS="-race -tags=mutex_debug" - name: setup docker image working-directory: ./.github/workflows/smoke