-
Notifications
You must be signed in to change notification settings - Fork 3.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
chore(metastore): Update metastores after Flushing a dataobj #15883
Changes from all commits
1c694ba
b4278d6
9fd28d1
223df08
3769350
e68af68
a643a04
7997dcb
13b6404
684010d
af79870
3bfef7d
aad0c8e
d3f7d90
0ad0d5e
ecb5d86
3fe3fd4
226f515
b1825d7
1800951
3e59b90
16c29be
d1c8ee2
82c2c31
1b4633a
b978510
8188ee7
e49df95
d3eaabb
ee473a0
0b5be06
80ee08e
abcf9c7
a829763
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,9 @@ import ( | |
"errors" | ||
"flag" | ||
"fmt" | ||
"io" | ||
"sort" | ||
"time" | ||
|
||
"github.com/grafana/dskit/flagext" | ||
lru "github.com/hashicorp/golang-lru/v2" | ||
|
@@ -138,6 +141,11 @@ const ( | |
builderStateFlush | ||
) | ||
|
||
type FlushResult struct { | ||
Path string | ||
MinTimestamp, MaxTimestamp time.Time | ||
} | ||
|
||
// NewBuilder creates a new Builder which stores data objects for the specified | ||
// tenant in a bucket. | ||
// | ||
|
@@ -180,6 +188,39 @@ func NewBuilder(cfg BuilderConfig, bucket objstore.Bucket, tenantID string) (*Bu | |
}, nil | ||
} | ||
|
||
// FromExisting updates this builder with content from an existing data object, replicating all the state like stream IDs and logs. | ||
func (b *Builder) FromExisting(f io.ReadSeeker) error { | ||
if b.currentSizeEstimate > 0 { | ||
return fmt.Errorf("builder already has data, cannot use FromExisting") | ||
} | ||
|
||
dec := encoding.ReadSeekerDecoder(f) | ||
|
||
var streamIDs = make(map[int64]*labels.Labels, 32) | ||
for result := range streams.Iter(context.Background(), dec) { | ||
stream, err := result.Value() | ||
if err != nil { | ||
return err | ||
} | ||
sort.Sort(stream.Labels) | ||
streamIDs[stream.ID] = &stream.Labels | ||
} | ||
|
||
for result := range logs.Iter(context.Background(), dec) { | ||
record, err := result.Value() | ||
if err != nil { | ||
return err | ||
} | ||
streamLabels := streamIDs[record.StreamID] | ||
|
||
b.streams.Record(*streamLabels, record.Timestamp) | ||
b.logs.Append(record) | ||
} | ||
|
||
b.state = builderStateDirty | ||
return nil | ||
} | ||
|
||
// Append buffers a stream to be written to a data object. Append returns an | ||
// error if the stream labels cannot be parsed or [ErrBufferFull] if the | ||
// builder is full. | ||
|
@@ -286,15 +327,10 @@ func streamSizeEstimate(stream logproto.Stream) int { | |
// If Flush builds an object but fails to upload it to object storage, the | ||
// built object is cached and can be retried. [Builder.Reset] can be called to | ||
// discard any pending data and allow new data to be appended. | ||
func (b *Builder) Flush(ctx context.Context) error { | ||
switch b.state { | ||
case builderStateEmpty: | ||
return nil // Nothing to flush | ||
case builderStateDirty: | ||
if err := b.buildObject(); err != nil { | ||
return fmt.Errorf("building object: %w", err) | ||
} | ||
b.state = builderStateFlush | ||
func (b *Builder) Flush(ctx context.Context) (FlushResult, error) { | ||
_, err := b.FlushToBuffer() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. By the way, this changes the behaviour of Flush where calling Flush immediately after a successful flush will cause it to re-write the same object. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've moved Reset back into this method now that I'm returning the FlushResult summary. Does that fix the issue? |
||
if err != nil { | ||
return FlushResult{}, err | ||
} | ||
|
||
timer := prometheus.NewTimer(b.metrics.flushTime) | ||
|
@@ -305,11 +341,32 @@ func (b *Builder) Flush(ctx context.Context) error { | |
|
||
objectPath := fmt.Sprintf("tenant-%s/objects/%s/%s", b.tenantID, sumStr[:b.cfg.SHAPrefixSize], sumStr[b.cfg.SHAPrefixSize:]) | ||
if err := b.bucket.Upload(ctx, objectPath, bytes.NewReader(b.flushBuffer.Bytes())); err != nil { | ||
return err | ||
return FlushResult{}, fmt.Errorf("uploading object: %w", err) | ||
} | ||
|
||
minTimestamp, maxTimestamp := b.streams.GetBounds() | ||
|
||
b.Reset() | ||
return nil | ||
|
||
return FlushResult{ | ||
Path: objectPath, | ||
MinTimestamp: minTimestamp, | ||
MaxTimestamp: maxTimestamp, | ||
}, nil | ||
} | ||
|
||
func (b *Builder) FlushToBuffer() (*bytes.Buffer, error) { | ||
switch b.state { | ||
case builderStateEmpty: | ||
return nil, nil // Nothing to flush | ||
case builderStateDirty: | ||
if err := b.buildObject(); err != nil { | ||
return nil, fmt.Errorf("building object: %w", err) | ||
} | ||
b.state = builderStateFlush | ||
} | ||
|
||
return b.flushBuffer, nil | ||
} | ||
|
||
func (b *Builder) buildObject() error { | ||
|
@@ -345,6 +402,7 @@ func (b *Builder) Reset() { | |
b.state = builderStateEmpty | ||
b.flushBuffer.Reset() | ||
b.metrics.sizeEstimate.Set(0) | ||
b.currentSizeEstimate = 0 | ||
} | ||
|
||
// RegisterMetrics registers metrics about builder to report to reg. All | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,9 +3,11 @@ | |
package streams | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
"fmt" | ||
"sort" | ||
"sync" | ||
"time" | ||
|
||
"github.com/prometheus/client_golang/prometheus" | ||
|
@@ -16,6 +18,7 @@ | |
"github.com/grafana/loki/v3/pkg/dataobj/internal/encoding" | ||
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/datasetmd" | ||
"github.com/grafana/loki/v3/pkg/dataobj/internal/metadata/streamsmd" | ||
"github.com/grafana/loki/v3/pkg/dataobj/internal/result" | ||
"github.com/grafana/loki/v3/pkg/dataobj/internal/streamio" | ||
"github.com/grafana/loki/v3/pkg/dataobj/internal/util/sliceclear" | ||
) | ||
|
@@ -33,6 +36,20 @@ | |
Rows int // Number of rows in the stream. | ||
} | ||
|
||
func (s *Stream) Reset() { | ||
s.ID = 0 | ||
s.Labels = nil | ||
s.MinTimestamp = time.Time{} | ||
s.MaxTimestamp = time.Time{} | ||
s.Rows = 0 | ||
} | ||
|
||
var streamPool = sync.Pool{ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This pool added about 10% more ops in the benchmark when re-using dataobjs between metastores. In reality we won't be reusing Streams very often (once per flush), so it might not be worth keeping this pool as the Stream objects will likely be deallocated between runs. |
||
New: func() interface{} { | ||
return &Stream{} | ||
}, | ||
} | ||
|
||
// Streams tracks information about streams in a data object. | ||
type Streams struct { | ||
metrics *Metrics | ||
|
@@ -61,10 +78,26 @@ | |
return &Streams{ | ||
metrics: metrics, | ||
pageSize: pageSize, | ||
lookup: make(map[uint64][]*Stream), | ||
lookup: make(map[uint64][]*Stream, 1024), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This yielded a 10-20% speed up over the baseline. We're not likely to have this many Streams in a metastore but it would be worth it for the logs dataobj, I think. |
||
ordered: make([]*Stream, 0, 1024), | ||
} | ||
} | ||
|
||
func (s *Streams) Iter(ctx context.Context) result.Seq[Stream] { | ||
return result.Iter(func(yield func(Stream) bool) error { | ||
for _, stream := range s.ordered { | ||
if !yield(*stream) { | ||
return nil | ||
} | ||
} | ||
return nil | ||
}) | ||
} | ||
|
||
func (s *Streams) GetBounds() (time.Time, time.Time) { | ||
return s.globalMinTimestamp, s.globalMaxTimestamp | ||
} | ||
|
||
// Record a stream record within the Streams section. The provided timestamp is | ||
// used to track the minimum and maximum timestamp of a stream. The number of | ||
// calls to Record is used to track the number of rows for a stream. | ||
|
@@ -153,7 +186,11 @@ | |
s.currentLabelsSize += len(lbl.Value) | ||
} | ||
|
||
newStream := &Stream{ID: s.lastID.Add(1), Labels: streamLabels} | ||
newStream := streamPool.Get().(*Stream) | ||
newStream.Reset() | ||
newStream.ID = s.lastID.Add(1) | ||
newStream.Labels = streamLabels | ||
|
||
s.lookup[hash] = append(s.lookup[hash], newStream) | ||
s.ordered = append(s.ordered, newStream) | ||
s.metrics.streamCount.Inc() | ||
|
@@ -187,7 +224,6 @@ | |
func (s *Streams) EncodeTo(enc *encoding.Encoder) error { | ||
timer := prometheus.NewTimer(s.metrics.encodeSeconds) | ||
defer timer.ObserveDuration() | ||
defer s.Reset() | ||
|
||
// TODO(rfratto): handle one section becoming too large. This can happen when | ||
// the number of columns is very wide. There are two approaches to handle | ||
|
@@ -333,6 +369,9 @@ | |
// Reset resets all state, allowing Streams to be reused. | ||
func (s *Streams) Reset() { | ||
s.lastID.Store(0) | ||
for _, stream := range s.ordered { | ||
streamPool.Put(stream) | ||
} | ||
clear(s.lookup) | ||
s.ordered = sliceclear.Clear(s.ordered) | ||
s.currentLabelsSize = 0 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not a massive fan of this method, and I think it could be made more efficient. I want to test it out before committing to any improvements here though.