Skip to content

Commit

Permalink
chore: major refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
dimaqq committed Jan 17, 2025
1 parent 61f887c commit 60ab778
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 37 deletions.
5 changes: 2 additions & 3 deletions dont-merge/send-traces.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@
opentelemetry.trace.get_tracer_provider().add_span_processor(span_processor) # type: ignore


@tracer.start_as_current_span("some label") # type: ignore
@tracer.start_as_current_span('some label') # type: ignore
def main(foo: int = 42):
"""Do something."""
# can't add attributes to a decorator, if needed use the below instead
#
#
# with tracer.start_as_current_span("some label") as span:
# span.set_attribute('foo', 'bar')
# span.add_event('sample_event', {'event_attr': 123})
Expand All @@ -61,4 +61,3 @@ def main(foo: int = 42):
main()
# from typing_extensions import reveal_type
# reveal_type(main)

16 changes: 10 additions & 6 deletions ops/_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,19 +557,23 @@ def main(charm_class: Type[ops.charm.CharmBase], use_juju_for_storage: Optional[
See `ops.main() <#ops-main-entry-point>`_ for details.
"""
setup_tracing(charm_class.__name__)

# FIXME temp testing
for _ in range(9999):
import time

time.sleep(0.0001)
with tracer.start_as_current_span('test only'):
...

# opentelemetry-api types are broken
# https://github.com/open-telemetry/opentelemetry-python/issues/3836
for i in range(999):
with tracer.start_as_current_span('test only'): ...

with tracer.start_as_current_span('ops.main'): # type: ignore
try:
manager = _Manager(charm_class, use_juju_for_storage=use_juju_for_storage)

manager.run()
except _Abort as e:
shutdown_tracing()
sys.exit(e.exit_code)
finally:
print("X"*99)
shutdown_tracing()
shutdown_tracing()
4 changes: 2 additions & 2 deletions ops/tracing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

from ops.tracing._hacks import remove_stale_otel_sdk_packages


# FIXME must this hack be run before OTEL packages are imported?
remove_stale_otel_sdk_packages()

Expand All @@ -49,7 +48,8 @@ def shutdown_tracing() -> None:
try:
_fixme.shutdown_tracing()
except Exception:
logging.exception("failed to flush tracing")
logging.exception('failed to flush tracing')


def reset_tracing():
"""FIXME: decide if this should be public, maybe it's oinly for testing?"""
Expand Down
9 changes: 5 additions & 4 deletions ops/tracing/_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""FIXME Docstring."""

from __future__ import annotations

import logging
Expand All @@ -18,7 +19,7 @@
import tempfile
from typing import IO

BUFFER_SAFETY_LIMIT = 64 * 1024 ** 2
BUFFER_SAFETY_LIMIT = 64 * 1024**2
SEPARATOR = b'__CHARM_TRACING_BUFFER_SPAN_SEP__'
"""Exact, verbatim value that separates buffered chunks."""

Expand All @@ -36,10 +37,10 @@ class Buffer:
"""

def __init__(self):
self.file = tempfile.TemporaryFile(mode='wb+')
self.file = tempfile.TemporaryFile(mode='wb+') # noqa: SIM115

def pivot(self, buffer_path: pathlib.Path) -> None:
"""Pivot fron anonymous temporary file to a named buffer file."""
"""Pivot from anonymous temporary file to a named buffer file."""
self.file.seek(0)
data = self.file.read()

Expand All @@ -53,7 +54,7 @@ def append(self, data: bytes) -> None:
# or against doubling by pivot to the very same path.

if load and data and load + len(SEPARATOR) + len(data) > BUFFER_SAFETY_LIMIT:
logger.warning("Buffer full, dropping old data")
logger.warning('Buffer full, dropping old data')
self.file.seek(0)
self.file.truncate(0)
load = 0
Expand Down
73 changes: 57 additions & 16 deletions ops/tracing/_fixme.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,31 @@

import logging
import os
import time
from pathlib import Path
from typing import Callable, Sequence, Type
from typing import Callable, Sequence, Type

from opentelemetry.exporter.otlp.proto.common._internal.trace_encoder import (
encode_spans,
)
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import ReadableSpan, TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult
from opentelemetry.trace import get_tracer_provider, set_tracer_provider

import ops
import ops.jujucontext
import ops.tracing._buffer


logger = logging.getLogger(__name__)

_OTLP_SPAN_EXPORTER_TIMEOUT = 1 # seconds
"""How much to give OTLP span exporter has to push traces to the backend."""


class ProxySpanExporter(SpanExporter):
real_exporter: SpanExporter|None
real_exporter: SpanExporter | None
buffer: ops.tracing._buffer.Buffer

def __init__(self):
Expand All @@ -51,28 +51,59 @@ def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
# Note:
# this is called in a helper thread, which is daemonic,
# the MainThread will wait at most 10s for this thread.
# Margins:
# - 1s safety margin
# - 1s for buffered data time overhang
# - 2s for live data
deadline = time.monotonic() + 6

# __import__("pdb").set_trace()
import threading
print("E"*33 + str(threading.current_thread()))

print('Ex' * 20 + str(threading.current_thread()))

if self.real_exporter:
buffered = self.buffer.load()
print(f"{len(buffered)=}")
print(f'{len(buffered)=}')
for chunk in buffered:
if time.monotonic() > deadline:
break
if not self.real_exporter._export(chunk).ok: # type: ignore
break
else:
self.buffer.drop()

# Note: [] --> b''
data: bytes = encode_spans(spans).SerializePartialToString()
print(f"{len(data)=} {len(spans)=}")
print(f'{len(data)=} {len(spans)=}')

sent = False
if self.real_exporter:
if self.real_exporter and time.monotonic() < deadline:
sent = self.real_exporter.export(spans) == SpanExportResult.SUCCESS

print(f'{sent=}')

# FIXME a couple of strategies are possible, but all thave downsides:
#
# Send buffered first, then send live data or buffer it
# - what if there's too much live data, and we're killed?
#
# Send some buffered data, then send live data or buffer it
# - what to do with remaining buffered data?
#
# Buffer new data first, then send as much as possible
# - what to do with remaining buffered data?
#
# What to do with remaining buffered data?
# - on partial send, rewriting the file:
# it's expensive...
#
# - leave data in the buffer on partial send:
# erase is cheap
# re-sending traces is allowed in OTEL
# However..
# if there's a lot of data / receiver is slow,
# we'll end up with a grwoing buffer
# until such time that buffer is full and is reset
if not sent:
self.buffer.append(data)

Expand All @@ -95,8 +126,10 @@ def get_server_cert(
server_cert_attr: str,
charm_instance: ops.CharmBase,
charm_type: Type[ops.CharmBase],
) -> str|Path|None:
_server_cert: str|Path|None|Callable[[], str|Path|None] = getattr(charm_instance, server_cert_attr)
) -> str | Path | None:
_server_cert: str | Path | None | Callable[[], str | Path | None] = getattr(
charm_instance, server_cert_attr
)
server_cert = _server_cert() if callable(_server_cert) else _server_cert

if server_cert is None:
Expand All @@ -116,7 +149,7 @@ def get_server_cert(
def setup_tracing(charm_class_name: str) -> None:
# FIXME would it be better to pass Juju context explicitly?
juju_context = ops.jujucontext._JujuContext.from_dict(os.environ)
app_name = "" if juju_context.unit_name is None else juju_context.unit_name.split('/')[0]
app_name = '' if juju_context.unit_name is None else juju_context.unit_name.split('/')[0]
service_name = f'{app_name}-charm' # only one COS charm sets custom value

resource = Resource.create(
Expand All @@ -136,19 +169,27 @@ def setup_tracing(charm_class_name: str) -> None:
exporter = ProxySpanExporter()

# real exporter, hardcoded for now
real_exporter = OTLPSpanExporter(endpoint='http://localhost:4318/v1/traces')
real_exporter._MAX_RETRY_TIMEOUT = 4 # type: ignore
real_exporter = OTLPSpanExporter(endpoint='http://localhost:4318/v1/traces', timeout=1)
# This is actually the max delay value in the sequence 1, 2, ..., MAX
# Set to 1 to disable sending live data (buffered data is still eventually sent)
# Set to 2 (or more) to enable sending live data (after buffered)
#
# _MAX_RETRY_TIMEOUT = 2 with timeout=1 means:
# - 1 attempt to send live, 1s sleep in the worst case
# _MAX_RETRY_TIMEOUT = 3 or 4 with timeout=1 means:
# - 1st attempt, 1s sleep, 2nd attempt, 1s sleep in the worst case
real_exporter._MAX_RETRY_TIMEOUT = 2 # type: ignore
exporter.set_real_exporter(real_exporter)

# How

span_processor = BatchSpanProcessor(exporter)
provider.add_span_processor(span_processor)
print("S" * 99)
print('St' * 50)
set_tracer_provider(provider)


def shutdown_tracing() -> None:
"""Shutdown tracing, which typically flushes data out."""
print("F"*99)
print('Sh' * 50)
get_tracer_provider().shutdown() # type: ignore
5 changes: 2 additions & 3 deletions ops/tracing/_hacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""Workarounds for various Juju bugs."""

from __future__ import annotations

import os
import logging
import os
import shutil
from collections import defaultdict
from typing import Any
Expand All @@ -37,7 +38,6 @@ def remove_stale_otel_sdk_packages() -> None:
This only has an effect if executed on an upgrade-charm event.
"""

if os.getenv('JUJU_DISPATCH_PATH') != 'hooks/upgrade-charm':
return

Expand Down Expand Up @@ -65,4 +65,3 @@ def remove_stale_otel_sdk_packages() -> None:
shutil.rmtree(path)

logger.debug('Successfully applied _remove_stale_otel_sdk_packages patch. ')

18 changes: 15 additions & 3 deletions ops/tracing/_leftovers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright 2022 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
# file except in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
"""FIXME Docstring."""


class SomethingLater:
def flush(self) -> None:
Expand Down Expand Up @@ -126,7 +139,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs):
# until tracing comes online
buffer_only = True

server_cert: str|Path|None = (
server_cert: str | Path | None = (
_get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None
)

Expand Down Expand Up @@ -410,7 +423,6 @@ def trace_type(cls: _T) -> _T:
return cls



def trace_method(method: _F, name: str | None = None) -> _F:
"""Trace this method.
Expand All @@ -429,7 +441,6 @@ def trace_function(function: _F, name: str | None = None) -> _F:
return _trace_callable(function, 'function', name=name)



def _trace_callable(
callable_: _F,
qualifier: str,
Expand All @@ -445,6 +456,7 @@ def wrapped_function(*args, **kwargs): # type: ignore
)
# FIXME do we want this magical auto-instrumentation at all?
import typing_extensions

typing_extensions.reveal_type(autoinstrument_tracer.start_as_current_span)
with autoinstrument_tracer.start_as_current_span(f'{qualifier} call: {name_}'):
return callable(*args, **kwargs) # type: ignore
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,12 @@ keep-runtime-typing = true
"RUF012", # Mutable class attributes should be annotated with `typing.ClassVar`
"UP031", # Use format specifiers instead of percent format
]
# FIXME don't merge this, remove the file instead
"ops/tracing/_leftovers.py" = [
"F821",
"E303",
"E227",
]

[tool.ruff.lint.pydocstyle]
convention = "google"
Expand Down

0 comments on commit 60ab778

Please sign in to comment.