From b15f8c8be63bcf808612fb208d130777f4927a82 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Tue, 22 Apr 2025 20:19:51 +0000 Subject: [PATCH 01/21] Add a timeout to export calls --- .../otlp/proto/grpc/_log_exporter/__init__.py | 12 +- .../exporter/otlp/proto/grpc/exporter.py | 137 ++++++--------- .../proto/grpc/metric_exporter/__init__.py | 22 ++- .../proto/grpc/trace_exporter/__init__.py | 14 +- .../test-requirements.txt | 1 + .../tests/test_otlp_exporter_mixin.py | 166 ++++++++++-------- .../tests/test_otlp_metrics_exporter.py | 6 +- .../tests/test_otlp_trace_exporter.py | 10 +- .../otlp/proto/http/_log_exporter/__init__.py | 35 ++-- .../proto/http/metric_exporter/__init__.py | 33 ++-- .../proto/http/trace_exporter/__init__.py | 58 +++--- .../metrics/test_otlp_metrics_exporter.py | 70 +++++--- .../tests/test_proto_log_exporter.py | 68 ++++--- .../tests/test_proto_span_exporter.py | 90 ++++++---- .../sdk/environment_variables/__init__.py | 8 +- 15 files changed, 407 insertions(+), 323 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py index 8f629899d77..b6a286ad27a 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py @@ -58,7 +58,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): if insecure is None: @@ -79,7 +79,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_LOGS_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( @@ -107,8 +107,12 @@ def _translate_data( ) -> ExportLogsServiceRequest: return encode_logs(data) - def export(self, batch: Sequence[LogData]) -> LogExportResult: - return self._export(batch) + def export( + self, batch: Sequence[LogData], timeout_millis: Optional[float] = None + ) -> LogExportResult: + return self._export( + batch, timeout_millis / 1e3 if timeout_millis else None + ) def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: OTLPExporterMixin.shutdown(self, timeout_millis=timeout_millis) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index 79270b99a0c..d169d1e5a80 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -14,12 +14,12 @@ """OTLP Exporter""" +import json import threading from abc import ABC, abstractmethod from collections.abc import Sequence # noqa: F401 from logging import getLogger from os import environ -from time import sleep from typing import ( # noqa: F401 Any, Callable, @@ -35,7 +35,6 @@ from urllib.parse import urlparse from deprecated import deprecated -from google.rpc.error_details_pb2 import RetryInfo from grpc import ( ChannelCredentials, @@ -47,7 +46,6 @@ 
ssl_channel_credentials, ) from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, _get_resource_data, ) from opentelemetry.exporter.otlp.proto.grpc import ( @@ -74,6 +72,29 @@ from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.util.re import parse_env_headers +json_config = json.dumps( + { + "methodConfig": [ + { + "name": [dict()], + "retryPolicy": { + "maxAttempts": 5, + "initialBackoff": "1s", + "maxBackoff": "64s", + "backoffMultiplier": 2, + "retryableStatusCodes": [ + "UNAVAILABLE", + "CANCELLED", + "RESOURCE_EXHAUSTED", + "ABORTED", + "OUT_OF_RANGE", + "DATA_LOSS", + ], + }, + } + ] + } +) logger = getLogger(__name__) SDKDataT = TypeVar("SDKDataT") ResourceDataT = TypeVar("ResourceDataT") @@ -195,7 +216,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): super().__init__() @@ -232,7 +253,7 @@ def __init__( else: self._headers = tuple(self._headers) + tuple(_OTLP_GRPC_HEADERS) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, 10) ) self._collector_kwargs = None @@ -245,7 +266,11 @@ def __init__( if insecure: self._channel = insecure_channel( - self._endpoint, compression=compression + self._endpoint, + compression=compression, + options=[ + ("grpc.service_config", json_config), + ], ) else: credentials = _get_credentials( @@ -255,7 +280,12 @@ def __init__( OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE, ) self._channel = secure_channel( - self._endpoint, credentials, compression=compression + self._endpoint, + credentials, + compression=compression, + options=[ + ("grpc.service_config", json_config), + ], ) self._client = self._stub(self._channel) @@ -269,10 +299,10 @@ def _translate_data( pass def _export( - self, data: Union[TypingSequence[ReadableSpan], MetricsData] + self, + data: Union[TypingSequence[ReadableSpan], MetricsData], + timeout_sec: Optional[float] = None, ) -> ExportResultT: - # After the call to shutdown, subsequent calls to Export are - # not allowed and should return a Failure result. if self._shutdown: logger.warning("Exporter already shutdown, ignoring batch") return self._result.FAILURE @@ -280,79 +310,24 @@ def _export( # FIXME remove this check if the export type for traces # gets updated to a class that represents the proto # TracesData and use the code below instead. - # logger.warning( - # "Transient error %s encountered while exporting %s, retrying in %ss.", - # error.code(), - # data.__class__.__name__, - # delay, - # ) - max_value = 64 - # expo returns a generator that yields delay values which grow - # exponentially. Once delay is greater than max_value, the yielded - # value will remain constant. 
- for delay in _create_exp_backoff_generator(max_value=max_value): - if delay == max_value or self._shutdown: + with self._export_lock: + try: + self._client.Export( + request=self._translate_data(data), + metadata=self._headers, + timeout=(timeout_sec or self._timeout), + ) + return self._result.SUCCESS + except RpcError as error: + logger.error( + "Failed to export %s to %s, error code: %s", + self._exporting, + self._endpoint, + error.code(), + exc_info=error.code() == StatusCode.UNKNOWN, + ) return self._result.FAILURE - with self._export_lock: - try: - self._client.Export( - request=self._translate_data(data), - metadata=self._headers, - timeout=self._timeout, - ) - - return self._result.SUCCESS - - except RpcError as error: - if error.code() in [ - StatusCode.CANCELLED, - StatusCode.DEADLINE_EXCEEDED, - StatusCode.RESOURCE_EXHAUSTED, - StatusCode.ABORTED, - StatusCode.OUT_OF_RANGE, - StatusCode.UNAVAILABLE, - StatusCode.DATA_LOSS, - ]: - retry_info_bin = dict(error.trailing_metadata()).get( - "google.rpc.retryinfo-bin" - ) - if retry_info_bin is not None: - retry_info = RetryInfo() - retry_info.ParseFromString(retry_info_bin) - delay = ( - retry_info.retry_delay.seconds - + retry_info.retry_delay.nanos / 1.0e9 - ) - - logger.warning( - ( - "Transient error %s encountered while exporting " - "%s to %s, retrying in %ss." - ), - error.code(), - self._exporting, - self._endpoint, - delay, - ) - sleep(delay) - continue - else: - logger.error( - "Failed to export %s to %s, error code: %s", - self._exporting, - self._endpoint, - error.code(), - exc_info=error.code() == StatusCode.UNKNOWN, - ) - - if error.code() == StatusCode.OK: - return self._result.SUCCESS - - return self._result.FAILURE - - return self._result.FAILURE - def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: if self._shutdown: logger.warning("Exporter already shutdown, ignoring call") diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py index 8580dbb7386..8bd52fe80a9 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py @@ -13,10 +13,11 @@ from __future__ import annotations +import time from dataclasses import replace from logging import getLogger from os import environ -from typing import Iterable, List, Tuple, Union +from typing import Iterable, List, Optional, Tuple, Union from typing import Sequence as TypingSequence from grpc import ChannelCredentials, Compression @@ -99,7 +100,7 @@ def __init__( credentials: ChannelCredentials | None = None, headers: Union[TypingSequence[Tuple[str, str]], dict[str, str], str] | None = None, - timeout: int | None = None, + timeout: float | None = None, compression: Compression | None = None, preferred_temporality: dict[type, AggregationTemporality] | None = None, @@ -124,7 +125,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_METRICS_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( @@ -158,17 +159,22 @@ def _translate_data( def export( self, metrics_data: MetricsData, - timeout_millis: float = 10_000, + timeout_millis: 
Optional[float] = None, **kwargs, ) -> MetricExportResult: - # TODO(#2663): OTLPExporterMixin should pass timeout to gRPC + timeout_sec = ( + timeout_millis / 1e3 if timeout_millis else self._timeout # pylint: disable=protected-access + ) if self._max_export_batch_size is None: - return self._export(data=metrics_data) + return self._export(metrics_data, timeout_sec) export_result = MetricExportResult.SUCCESS - + deadline_sec = time.time() + timeout_sec for split_metrics_data in self._split_metrics_data(metrics_data): - split_export_result = self._export(data=split_metrics_data) + time_remaining_sec = deadline_sec - time.time() + split_export_result = self._export( + split_metrics_data, time_remaining_sec + ) if split_export_result is MetricExportResult.FAILURE: export_result = MetricExportResult.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py index c78c1b81bb6..5303d0fa840 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py @@ -91,7 +91,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): if insecure is None: @@ -112,7 +112,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( @@ -139,8 +139,14 @@ def _translate_data( ) -> ExportTraceServiceRequest: return encode_spans(data) - def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: - return self._export(spans) + def export( + self, + spans: Sequence[ReadableSpan], + timeout_millis: Optional[float] = None, + ) -> SpanExportResult: + return self._export( + spans, timeout_millis / 1e3 if timeout_millis else None + ) def shutdown(self) -> None: OTLPExporterMixin.shutdown(self) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt b/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt index 28d778461a9..01c9f1ddadd 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt @@ -2,6 +2,7 @@ asgiref==3.7.2 Deprecated==1.2.14 googleapis-common-protos==1.63.2 grpcio==1.66.2 +grpcio-status==1.66.0 importlib-metadata==6.11.0 iniconfig==2.0.0 packaging==24.0 diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 656d9a6cb79..5a75595f693 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -15,18 +15,14 @@ import threading import time from concurrent.futures import ThreadPoolExecutor -from logging import WARNING +from logging import WARNING, getLogger from typing import Any, Optional, Sequence from unittest import TestCase -from unittest.mock import Mock, patch +from 
unittest.mock import ANY, Mock, patch
 
-from google.protobuf.duration_pb2 import (  # pylint: disable=no-name-in-module
-    Duration,
-)
-from google.rpc.error_details_pb2 import (  # pylint: disable=no-name-in-module
-    RetryInfo,
-)
-from grpc import Compression, StatusCode, server
+from google.rpc import code_pb2, status_pb2
+from grpc import Compression, server
+from grpc_status import rpc_status
 
 from opentelemetry.exporter.otlp.proto.common.trace_encoder import (
     encode_spans,
@@ -55,6 +51,8 @@
     SpanExportResult,
 )
 
+logger = getLogger(__name__)
+
 
 # The below tests use this test SpanExporter and Spans, but are testing the
 # underlying behavior in the mixin. A MetricExporter or LogExporter could
@@ -73,8 +71,14 @@ def _translate_data(
     ) -> ExportTraceServiceRequest:
         return encode_spans(data)
 
-    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
-        return self._export(spans)
+    def export(
+        self,
+        spans: Sequence[ReadableSpan],
+        timeout_millis: Optional[float] = None,
+    ) -> SpanExportResult:
+        return self._export(
+            spans, timeout_millis / 1e3 if timeout_millis else None
+        )
 
     @property
     def _exporting(self):
@@ -87,40 +91,25 @@ def shutdown(self, timeout_millis=30_000):
 class TraceServiceServicerWithExportParams(TraceServiceServicer):
     def __init__(
         self,
-        export_result: StatusCode,
+        export_result: code_pb2,
         optional_export_sleep: Optional[float] = None,
-        optional_export_retry_millis: Optional[float] = None,
     ):
         self.export_result = export_result
         self.optional_export_sleep = optional_export_sleep
-        self.optional_export_retry_millis = optional_export_retry_millis
 
     # pylint: disable=invalid-name,unused-argument
     def Export(self, request, context):
+        logger.warning("Export Request Received")
         if self.optional_export_sleep:
             time.sleep(self.optional_export_sleep)
-        if self.optional_export_retry_millis:
-            context.send_initial_metadata(
-                (
-                    (
-                        "google.rpc.retryinfo-bin",
-                        RetryInfo().SerializeToString(),
-                    ),
-                )
-            )
-            context.set_trailing_metadata(
-                (
-                    (
-                        "google.rpc.retryinfo-bin",
-                        RetryInfo(
-                            retry_delay=Duration(
-                                nanos=int(self.optional_export_retry_millis)
-                            )
-                        ).SerializeToString(),
-                    ),
+        if self.export_result != code_pb2.OK:
+            context.abort_with_status(
+                rpc_status.to_status(
+                    status_pb2.Status(
+                        code=self.export_result,
+                    )
                 )
             )
-        context.set_code(self.export_result)
         return ExportTraceServiceResponse()
@@ -268,7 +257,9 @@ def test_otlp_exporter_otlp_compression_unspecified(
         """No env or kwarg should be NoCompression"""
         OTLPSpanExporterForTesting(insecure=True)
         mock_insecure_channel.assert_called_once_with(
-            "localhost:4317", compression=Compression.NoCompression
+            "localhost:4317",
+            compression=Compression.NoCompression,
+            options=ANY,
         )
 
     # pylint: disable=no-self-use, disable=unused-argument
@@ -292,12 +283,12 @@ def test_otlp_exporter_otlp_compression_envvar(
         """Just OTEL_EXPORTER_OTLP_COMPRESSION should work"""
         OTLPSpanExporterForTesting(insecure=True)
         mock_insecure_channel.assert_called_once_with(
-            "localhost:4317", compression=Compression.Gzip
+            "localhost:4317", compression=Compression.Gzip, options=ANY
         )
 
     def test_shutdown(self):
         add_TraceServiceServicer_to_server(
-            TraceServiceServicerWithExportParams(StatusCode.OK),
+            TraceServiceServicerWithExportParams(code_pb2.OK),
             self.server,
         )
         self.assertEqual(
@@ -316,7 +307,7 @@ def test_shutdown_wait_last_export(self):
         add_TraceServiceServicer_to_server(
             TraceServiceServicerWithExportParams(
-                StatusCode.OK, optional_export_sleep=1
+                code_pb2.OK, optional_export_sleep=1
             ),
             self.server,
         )
@@ -337,7 +328,7 @@ def test_shutdown_doesnot_wait_last_export(self):
         add_TraceServiceServicer_to_server(
             TraceServiceServicerWithExportParams(
-                StatusCode.OK, optional_export_sleep=3
+                code_pb2.OK, optional_export_sleep=3
            ),
             self.server,
         )
@@ -360,7 +351,7 @@ def test_export_over_closed_grpc_channel(self):
 
         # pylint: disable=protected-access
         add_TraceServiceServicer_to_server(
-            TraceServiceServicerWithExportParams(StatusCode.OK),
+            TraceServiceServicerWithExportParams(code_pb2.OK),
             self.server,
         )
         self.exporter.export([self.span])
@@ -372,52 +363,79 @@ def test_export_over_closed_grpc_channel(self):
             str(err.exception), "Cannot invoke RPC on closed channel!"
         )
 
-    @patch(
-        "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator"
-    )
-    @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep")
-    def test_unavailable(self, mock_sleep, mock_expo):
-        mock_expo.configure_mock(**{"return_value": [0.01]})
-
+    def test_retry_timeout(self):
         add_TraceServiceServicer_to_server(
-            TraceServiceServicerWithExportParams(StatusCode.UNAVAILABLE),
+            TraceServiceServicerWithExportParams(code_pb2.UNAVAILABLE),
             self.server,
         )
 
-        result = self.exporter.export([self.span])
-        self.assertEqual(result, SpanExportResult.FAILURE)
-        mock_sleep.assert_called_with(0.01)
-
-    @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep")
-    def test_unavailable_delay(self, mock_sleep):
+        with self.assertLogs(level=WARNING) as warning:
+            # Set timeout to 1.5 seconds
+            self.assertEqual(
+                self.exporter.export([self.span], 1500),
+                SpanExportResult.FAILURE,
+            )
+            # Our gRPC retry policy starts with a 1 second backoff then doubles.
+            # So we expect just two calls: one at time 0, one at time 1.
+            # The final log is from when export fails.
+            self.assertEqual(len(warning.records), 3)
+            for idx, log in enumerate(warning.records):
+                if idx != 2:
+                    self.assertEqual(
+                        "Export Request Received",
+                        log.message,
+                    )
+                else:
+                    self.assertEqual(
+                        "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED",
+                        log.message,
+                    )
+        with self.assertLogs(level=WARNING) as warning:
+            exporter = OTLPSpanExporterForTesting(insecure=True, timeout=3.5)
+            # This time don't pass in a timeout to export, so it should fall back to the timeout
+            # passed to the exporter class.
+            # pylint: disable=protected-access
+            self.assertEqual(exporter._timeout, 3.5)
+            self.assertEqual(
+                exporter.export([self.span]),
+                SpanExportResult.FAILURE,
+            )
+            # We expect 3 calls: time 0, time 1, time 3, but not time 7.
+            # The final log is from when export fails.
+            self.assertEqual(len(warning.records), 4)
+            for idx, log in enumerate(warning.records):
+                if idx != 3:
+                    self.assertEqual(
+                        "Export Request Received",
+                        log.message,
+                    )
+                else:
+                    self.assertEqual(
+                        "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED",
+                        log.message,
+                    )
+
+    def test_timeout_set_correctly(self):
         add_TraceServiceServicer_to_server(
             TraceServiceServicerWithExportParams(
-                StatusCode.UNAVAILABLE,
-                optional_export_sleep=None,
-                optional_export_retry_millis=1e7,
+                code_pb2.OK, optional_export_sleep=0.5
            ),
             self.server,
         )
+        # Should timeout. Deadline should be set to now + timeout.
+        # That is 400 millis from now, and export sleeps for 500 millis.
+        
with self.assertLogs(level=WARNING) as warning: self.assertEqual( - self.exporter.export([self.span]), SpanExportResult.FAILURE + self.exporter.export([self.span], 400), + SpanExportResult.FAILURE, ) - mock_sleep.assert_called_with(0.01) - self.assertEqual( - warning.records[0].message, - ( - "Transient error StatusCode.UNAVAILABLE encountered " - "while exporting traces to localhost:4317, retrying in 0.01s." - ), + "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", + warning.records[-1].message, ) - def test_success(self): - add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(StatusCode.OK), - self.server, - ) self.assertEqual( - self.exporter.export([self.span]), SpanExportResult.SUCCESS + self.exporter.export([self.span], 600), + SpanExportResult.SUCCESS, ) def test_otlp_headers_from_env(self): @@ -431,15 +449,13 @@ def test_otlp_headers_from_env(self): def test_permanent_failure(self): with self.assertLogs(level=WARNING) as warning: add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams( - StatusCode.ALREADY_EXISTS - ), + TraceServiceServicerWithExportParams(code_pb2.ALREADY_EXISTS), self.server, ) self.assertEqual( self.exporter.export([self.span]), SpanExportResult.FAILURE ) self.assertEqual( - warning.records[0].message, + warning.records[-1].message, "Failed to export traces to localhost:4317, error code: StatusCode.ALREADY_EXISTS", ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py index 2ea12f660fb..ceda6e72a8e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py @@ -18,7 +18,7 @@ from os.path import dirname from typing import List from unittest import TestCase -from unittest.mock import patch +from unittest.mock import ANY, patch from grpc import ChannelCredentials, Compression @@ -297,7 +297,9 @@ def test_otlp_exporter_otlp_compression_kwarg(self, mock_insecure_channel): insecure=True, compression=Compression.NoCompression ) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.NoCompression + "localhost:4317", + compression=Compression.NoCompression, + options=ANY, ) def test_split_metrics_data_many_data_points(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py index 73d8d6c7a20..ea39a7792d4 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py @@ -16,7 +16,7 @@ import os from unittest import TestCase -from unittest.mock import Mock, PropertyMock, patch +from unittest.mock import Mock, PropertyMock, patch, ANY from grpc import ChannelCredentials, Compression @@ -333,7 +333,9 @@ def test_otlp_exporter_otlp_compression_kwarg(self, mock_insecure_channel): """Specifying kwarg should take precedence over env""" OTLPSpanExporter(insecure=True, compression=Compression.NoCompression) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.NoCompression + "localhost:4317", + compression=Compression.NoCompression, + options=ANY, ) # pylint: disable=no-self-use @@ -350,7 +352,9 @@ def 
test_otlp_exporter_otlp_compression_precendence( """ OTLPSpanExporter(insecure=True) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.Gzip + "localhost:4317", + compression=Compression.Gzip, + options=ANY, ) def test_translate_spans(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 21b877380c8..4662c8e4d55 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -17,14 +17,11 @@ import zlib from io import BytesIO from os import environ -from time import sleep +from time import sleep, time from typing import Dict, Optional, Sequence import requests -from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, -) from opentelemetry.exporter.otlp.proto.common._log_encoder import encode_logs from opentelemetry.exporter.otlp.proto.http import ( _OTLP_HTTP_HEADERS, @@ -63,8 +60,6 @@ class OTLPLogExporter(LogExporter): - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: Optional[str] = None, @@ -72,7 +67,7 @@ def __init__( client_key_file: Optional[str] = None, client_certificate_file: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): @@ -107,7 +102,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_LOGS_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -123,7 +118,7 @@ def __init__( ) self._shutdown = False - def _export(self, serialized_data: bytes): + def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -137,7 +132,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) @@ -149,7 +144,9 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def export(self, batch: Sequence[LogData]) -> LogExportResult: + def export( + self, batch: Sequence[LogData], timeout_millis: Optional[float] = None + ) -> LogExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result. 
if self._shutdown: @@ -157,18 +154,20 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: return LogExportResult.FAILURE serialized_data = encode_logs(batch).SerializeToString() - - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: + deadline_sec = time() + ( + timeout_millis / 1e3 if timeout_millis else self._timeout + ) + for delay in [1, 2, 4, 8, 16, 32]: + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: return LogExportResult.FAILURE - - resp = self._export(serialized_data) + resp = self._export(serialized_data, remaining_time_sec) # pylint: disable=no-else-return if resp.ok: return LogExportResult.SUCCESS elif self._retryable(resp): + if delay > (deadline_sec - time()): + return LogExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting logs batch, retrying in %ss.", resp.reason, diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 00f429e4c97..46e7d3b84b6 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -17,13 +17,14 @@ import zlib from io import BytesIO from os import environ -from time import sleep +from time import sleep, time from typing import ( # noqa: F401 Any, Callable, Dict, List, Mapping, + Optional, Sequence, ) @@ -31,7 +32,6 @@ from deprecated import deprecated from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, _get_resource_data, ) from opentelemetry.exporter.otlp.proto.common._internal.metrics_encoder import ( @@ -100,8 +100,6 @@ class OTLPMetricExporter(MetricExporter, OTLPMetricExporterMixin): - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: str | None = None, @@ -109,7 +107,7 @@ def __init__( client_key_file: str | None = None, client_certificate_file: str | None = None, headers: dict[str, str] | None = None, - timeout: int | None = None, + timeout: float | None = None, compression: Compression | None = None, session: requests.Session | None = None, preferred_temporality: dict[type, AggregationTemporality] @@ -146,7 +144,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_METRICS_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -165,7 +163,7 @@ def __init__( preferred_temporality, preferred_aggregation ) - def _export(self, serialized_data: bytes): + def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -179,7 +177,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) @@ -194,21 +192,26 @@ def _retryable(resp: requests.Response) -> bool: def export( self, metrics_data: MetricsData, - timeout_millis: float = 10_000, + timeout_millis: Optional[float] = None, **kwargs, ) -> MetricExportResult: serialized_data = encode_metrics(metrics_data) - for delay in 
_create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: + deadline_sec = time() + ( + timeout_millis / 1e3 if timeout_millis else self._timeout + ) + for delay in [1, 2, 4, 8, 16, 32]: + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: return MetricExportResult.FAILURE - - resp = self._export(serialized_data.SerializeToString()) + resp = self._export( + serialized_data.SerializeToString(), remaining_time_sec + ) # pylint: disable=no-else-return if resp.ok: return MetricExportResult.SUCCESS elif self._retryable(resp): + if delay > (deadline_sec - time()): + return MetricExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting metric batch, retrying in %ss.", resp.reason, diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 7bcf4b4ced1..0c913df0e88 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -17,14 +17,11 @@ import zlib from io import BytesIO from os import environ -from time import sleep -from typing import Dict, Optional +from time import sleep, time +from typing import Dict, Optional, Sequence import requests -from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, -) from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( encode_spans, ) @@ -48,6 +45,7 @@ OTEL_EXPORTER_OTLP_TRACES_HEADERS, OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, ) +from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult from opentelemetry.util.re import parse_env_headers @@ -61,8 +59,6 @@ class OTLPSpanExporter(SpanExporter): - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: Optional[str] = None, @@ -70,7 +66,7 @@ def __init__( client_key_file: Optional[str] = None, client_certificate_file: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): @@ -104,7 +100,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -120,7 +116,7 @@ def __init__( ) self._shutdown = False - def _export(self, serialized_data: bytes): + def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -134,7 +130,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) @@ -146,21 +142,32 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def _serialize_spans(self, spans): - return encode_spans(spans).SerializePartialToString() + def export( + self, + spans: Sequence[ReadableSpan], + timeout_millis: Optional[float] = None, + ) -> SpanExportResult: + # After the call to Shutdown 
subsequent calls to Export are + # not allowed and should return a Failure result. + if self._shutdown: + _logger.warning("Exporter already shutdown, ignoring batch") + return SpanExportResult.FAILURE - def _export_serialized_spans(self, serialized_data): - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: + serialized_data = encode_spans(spans).SerializePartialToString() + deadline_sec = time() + ( + timeout_millis / 1e3 if timeout_millis else self._timeout + ) + for delay in [1, 2, 4, 8, 16, 32]: + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: return SpanExportResult.FAILURE - - resp = self._export(serialized_data) + resp = self._export(serialized_data, remaining_time_sec) # pylint: disable=no-else-return if resp.ok: return SpanExportResult.SUCCESS elif self._retryable(resp): + if delay > (deadline_sec - time()): + return SpanExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting span batch, retrying in %ss.", resp.reason, @@ -177,17 +184,6 @@ def _export_serialized_spans(self, serialized_data): return SpanExportResult.FAILURE return SpanExportResult.FAILURE - def export(self, spans) -> SpanExportResult: - # After the call to Shutdown subsequent calls to Export are - # not allowed and should return a Failure result. - if self._shutdown: - _logger.warning("Exporter already shutdown, ignoring batch") - return SpanExportResult.FAILURE - - serialized_data = self._serialize_spans(spans) - - return self._export_serialized_spans(serialized_data) - def shutdown(self): if self._shutdown: _logger.warning("Exporter already shutdown, ignoring call") diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py index 16bb3e54286..df7c0c17ea3 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py @@ -15,11 +15,10 @@ from logging import WARNING from os import environ from unittest import TestCase -from unittest.mock import MagicMock, Mock, call, patch +from unittest.mock import ANY, MagicMock, Mock, patch from requests import Session from requests.models import Response -from responses import POST, activate, add from opentelemetry.exporter.otlp.proto.common.metrics_encoder import ( encode_metrics, @@ -327,31 +326,10 @@ def test_serialization(self, mock_post): url=exporter._endpoint, data=serialized_data.SerializeToString(), verify=exporter._certificate_file, - timeout=exporter._timeout, + timeout=ANY, # Timeout is a float based on real time, can't put an exact value here. 
cert=exporter._client_cert,
         )
 
-    @activate
-    @patch("opentelemetry.exporter.otlp.proto.http.metric_exporter.sleep")
-    def test_exponential_backoff(self, mock_sleep):
-        # return a retryable error
-        add(
-            POST,
-            "http://metrics.example.com/export",
-            json={"error": "something exploded"},
-            status=500,
-        )
-
-        exporter = OTLPMetricExporter(
-            endpoint="http://metrics.example.com/export"
-        )
-        metrics_data = self.metrics["sum_int"]
-
-        exporter.export(metrics_data)
-        mock_sleep.assert_has_calls(
-            [call(1), call(2), call(4), call(8), call(16), call(32)]
-        )
-
     def test_aggregation_temporality(self):
         otlp_metric_exporter = OTLPMetricExporter()
 
@@ -523,3 +501,47 @@ def test_preferred_aggregation_override(self):
         self.assertEqual(
             exporter._preferred_aggregation[Histogram], histogram_aggregation
         )
+
+    @patch.object(Session, "post")
+    def test_retry_timeout(self, mock_post):
+        exporter = OTLPMetricExporter(timeout=3.5)
+
+        resp = Response()
+        resp.status_code = 503
+        resp.reason = "UNAVAILABLE"
+        mock_post.return_value = resp
+        with self.assertLogs(level=WARNING) as warning:
+            # Set timeout to 1.5 seconds
+            self.assertEqual(
+                exporter.export(self.metrics["sum_int"], 1500),
+                MetricExportResult.FAILURE,
+            )
+            # Code should return failure after retrying once; the second
+            # retry would exceed the timeout, so it is never attempted.
+            self.assertEqual(len(warning.records), 1)
+            self.assertEqual(
+                "Transient error UNAVAILABLE encountered while exporting metric batch, retrying in 1s.",
+                warning.records[0].message,
+            )
+        with self.assertLogs(level=WARNING) as warning:
+            # This time don't pass in a timeout, so it will fall back to the 3.5 seconds set on the class.
+            self.assertEqual(
+                exporter.export(self.metrics["sum_int"]),
+                MetricExportResult.FAILURE,
+            )
+            # 2 retries (after 1s, 3s).
+            self.assertEqual(len(warning.records), 2)
+
+    @patch.object(Session, "post")
+    def test_timeout_set_correctly(self, mock_post):
+        resp = Response()
+        resp.status_code = 200
+
+        def export_side_effect(*args, **kwargs):
+            # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed.
+            self.assertTrue(0.4 - kwargs["timeout"] < 0.0005)
+            return resp
+
+        mock_post.side_effect = export_side_effect
+        exporter = OTLPMetricExporter()
+        exporter.export(self.metrics["sum_int"], 400)
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py
index 66b0f890d76..00a00ae3aa9 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py
@@ -15,12 +15,14 @@
 # pylint: disable=protected-access
 
 import unittest
+from logging import WARNING
 from typing import List
-from unittest.mock import MagicMock, Mock, call, patch
+from unittest.mock import MagicMock, Mock, patch
 
 import requests
-import responses
 from google.protobuf.json_format import MessageToDict
+from requests import Session
+from requests.models import Response
 
 from opentelemetry._logs import SeverityNumber
 from opentelemetry.exporter.otlp.proto.http import Compression
@@ -267,25 +269,6 @@ def test_exported_log_without_span_id(self):
         else:
             self.fail("No log records found")
 
-    @responses.activate
-    @patch("opentelemetry.exporter.otlp.proto.http._log_exporter.sleep")
-    def test_exponential_backoff(self, mock_sleep):
-        # return a retryable error
-        responses.add(
-            responses.POST,
-            "http://logs.example.com/export",
-            json={"error": "something exploded"},
-            status=500,
-        )
-
-        exporter = OTLPLogExporter(endpoint="http://logs.example.com/export")
-        logs = self._get_sdk_log_data()
-
-        exporter.export(logs)
-        mock_sleep.assert_has_calls(
-            [call(1), call(2), call(4), call(8), call(16), call(32)]
-        )
-
     @staticmethod
     def _get_sdk_log_data() -> List[LogData]:
         log1 = LogData(
@@ -365,3 +348,46 @@ def test_2xx_status_code(self, mock_otlp_metric_exporter):
         self.assertEqual(
             OTLPLogExporter().export(MagicMock()), LogExportResult.SUCCESS
         )
+
+    @patch.object(Session, "post")
+    def test_retry_timeout(self, mock_post):
+        exporter = OTLPLogExporter(timeout=3.5)
+
+        resp = Response()
+        resp.status_code = 503
+        resp.reason = "UNAVAILABLE"
+        mock_post.return_value = resp
+        with self.assertLogs(level=WARNING) as warning:
+            # Set timeout to 1.5 seconds
+            self.assertEqual(
+                exporter.export(self._get_sdk_log_data(), 1500),
+                LogExportResult.FAILURE,
+            )
+            # Code should return failure after retrying once.
+            self.assertEqual(len(warning.records), 1)
+            self.assertEqual(
+                "Transient error UNAVAILABLE encountered while exporting logs batch, retrying in 1s.",
+                warning.records[0].message,
+            )
+        with self.assertLogs(level=WARNING) as warning:
+            # This time don't pass in a timeout, so it will fall back to the 3.5 seconds set on the class.
+            self.assertEqual(
+                exporter.export(self._get_sdk_log_data()),
+                LogExportResult.FAILURE,
+            )
+            # 2 retries (after 1s, 3s).
+            self.assertEqual(len(warning.records), 2)
+
+    @patch.object(Session, "post")
+    def test_timeout_set_correctly(self, mock_post):
+        resp = Response()
+        resp.status_code = 200
+
+        def export_side_effect(*args, **kwargs):
+            # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed.
+            self.assertTrue(0.4 - kwargs["timeout"] < 0.0005)
+            return resp
+
+        mock_post.side_effect = export_side_effect
+        exporter = OTLPLogExporter()
+        exporter.export(self._get_sdk_log_data(), 400)
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py
index 8d8ff6037aa..b7e357bbe4c 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 
 import unittest
-from unittest.mock import MagicMock, Mock, call, patch
+from logging import WARNING
+from unittest.mock import MagicMock, Mock, patch
 
 import requests
-import responses
+from requests import Session
+from requests.models import Response
 
 from opentelemetry.exporter.otlp.proto.http import Compression
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
@@ -52,6 +54,16 @@
 OS_ENV_CLIENT_KEY = "os/env/client-key.pem"
 OS_ENV_HEADERS = "envHeader1=val1,envHeader2=val2"
 OS_ENV_TIMEOUT = "30"
+BASIC_SPAN = _Span(
+    "abc",
+    context=Mock(
+        **{
+            "trace_state": {"a": "b", "c": "d"},
+            "span_id": 10217189687419569865,
+            "trace_id": 67545097771067222548457157018666467027,
+        }
+    ),
+)
 
 
 # pylint: disable=protected-access
@@ -227,37 +239,6 @@ def test_headers_parse_from_env(self):
             ),
         )
 
-    # pylint: disable=no-self-use
-    @responses.activate
-    @patch("opentelemetry.exporter.otlp.proto.http.trace_exporter.sleep")
-    def test_exponential_backoff(self, mock_sleep):
-        # return a retryable error
-        responses.add(
-            responses.POST,
-            "http://traces.example.com/export",
-            json={"error": "something exploded"},
-            status=500,
-        )
-
-        exporter = OTLPSpanExporter(
-            endpoint="http://traces.example.com/export"
-        )
-        span = _Span(
-            "abc",
-            context=Mock(
-                **{
-                    "trace_state": {"a": "b", "c": "d"},
-                    "span_id": 10217189687419569865,
-                    "trace_id": 67545097771067222548457157018666467027,
-                }
-            ),
-        )
-
-        exporter.export([span])
-        mock_sleep.assert_has_calls(
-            [call(1), call(2), call(4), call(8), call(16), call(32)]
-        )
-
     @patch.object(OTLPSpanExporter, "_export", return_value=Mock(ok=True))
     def test_2xx_status_code(self, mock_otlp_metric_exporter):
         """
@@ -267,3 +248,46 @@ def test_2xx_status_code(self, mock_otlp_metric_exporter):
         self.assertEqual(
             OTLPSpanExporter().export(MagicMock()), SpanExportResult.SUCCESS
         )
+
+    @patch.object(Session, "post")
+    def test_retry_timeout(self, mock_post):
+        exporter = OTLPSpanExporter(timeout=3.5)
+
+        resp = Response()
+        resp.status_code = 503
+        resp.reason = "UNAVAILABLE"
+        mock_post.return_value = resp
+        with self.assertLogs(level=WARNING) as warning:
+            # Set timeout to 1.5 seconds
+            self.assertEqual(
+                exporter.export([BASIC_SPAN], 1500),
+                SpanExportResult.FAILURE,
+            )
+            # Code should return failure after retrying once.
+            self.assertEqual(len(warning.records), 1)
+            self.assertEqual(
+                "Transient error UNAVAILABLE encountered while exporting span batch, retrying in 1s.",
+                warning.records[0].message,
+            )
+        with self.assertLogs(level=WARNING) as warning:
+            # This time don't pass in a timeout, so it will fall back to the 3.5 seconds set on the class.
+            self.assertEqual(
+                exporter.export([BASIC_SPAN]),
+                SpanExportResult.FAILURE,
+            )
+            # 2 retries (after 1s, 3s).
+            
+ self.assertEqual(len(warning.records), 2) + + @patch.object(Session, "post") + def test_timeout_set_correctly(self, mock_post): + resp = Response() + resp.status_code = 200 + + def export_side_effect(*args, **kwargs): + # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed. + self.assertTrue(0.4 - kwargs["timeout"] < 0.0005) + return resp + + mock_post.side_effect = export_side_effect + exporter = OTLPSpanExporter() + exporter.export([BASIC_SPAN], 400) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py index 4f69143084c..e6e91e84c6a 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py @@ -333,7 +333,7 @@ """ .. envvar:: OTEL_EXPORTER_OTLP_TIMEOUT -The :envvar:`OTEL_EXPORTER_OTLP_TIMEOUT` is the maximum time the OTLP exporter will wait for each batch export. +The :envvar:`OTEL_EXPORTER_OTLP_TIMEOUT` is the maximum number of seconds the OTLP exporter will wait for each batch export. Default: 10 """ @@ -535,7 +535,7 @@ """ .. envvar:: OTEL_EXPORTER_OTLP_TRACES_TIMEOUT -The :envvar:`OTEL_EXPORTER_OTLP_TRACES_TIMEOUT` is the maximum time the OTLP exporter will +The :envvar:`OTEL_EXPORTER_OTLP_TRACES_TIMEOUT` is the maximum number of seconds the OTLP exporter will wait for each batch export for spans. """ @@ -543,7 +543,7 @@ """ .. envvar:: OTEL_EXPORTER_OTLP_METRICS_TIMEOUT -The :envvar:`OTEL_EXPORTER_OTLP_METRICS_TIMEOUT` is the maximum time the OTLP exporter will +The :envvar:`OTEL_EXPORTER_OTLP_METRICS_TIMEOUT` is the maximum number of seconds the OTLP exporter will wait for each batch export for metrics. """ @@ -577,7 +577,7 @@ """ .. envvar:: OTEL_EXPORTER_OTLP_LOGS_TIMEOUT -The :envvar:`OTEL_EXPORTER_OTLP_LOGS_TIMEOUT` is the maximum time the OTLP exporter will +The :envvar:`OTEL_EXPORTER_OTLP_LOGS_TIMEOUT` is the maximum number of seconds the OTLP exporter will wait for each batch export for logs. 
""" From e0cf233b176bcc24a0a92066a0783a55e55ceb0f Mon Sep 17 00:00:00 2001 From: Jay Clifford <45856600+Jayclifford345@users.noreply.github.com> Date: Wed, 23 Apr 2025 15:16:47 +0100 Subject: [PATCH 02/21] feat: Updated and added examples (logs and metrics) (#4559) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * added examples * Apply suggestions from code review Co-authored-by: Emídio Neto <9735060+emdneto@users.noreply.github.com> * feat: added examples for metrics and logs * fixed spelling * Update docs/examples/metrics/reader/README.rst Co-authored-by: Emídio Neto <9735060+emdneto@users.noreply.github.com> --------- Co-authored-by: Emídio Neto <9735060+emdneto@users.noreply.github.com> --- CHANGELOG.md | 2 + docs/examples/logs/README.rst | 101 ++++++++++++------ docs/examples/logs/example.py | 4 + docs/examples/logs/otel-collector-config.yaml | 2 +- docs/examples/metrics/reader/README.rst | 1 + .../metrics/reader/synchronous_gauge_read.py | 88 +++++++++++++++ 6 files changed, 163 insertions(+), 35 deletions(-) create mode 100644 docs/examples/metrics/reader/synchronous_gauge_read.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d9e89b18034..256312c8c44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix serialization of extended attributes for logs signal ([#4342](https://github.com/open-telemetry/opentelemetry-python/pull/4342)) +- docs: updated and added to the metrics and log examples + ([#4559](https://github.com/open-telemetry/opentelemetry-python/pull/4559)) ## Version 1.32.0/0.53b0 (2025-04-10) diff --git a/docs/examples/logs/README.rst b/docs/examples/logs/README.rst index b61114733ec..e3cd86362b2 100644 --- a/docs/examples/logs/README.rst +++ b/docs/examples/logs/README.rst @@ -52,37 +52,70 @@ The resulting logs will appear in the output from the collector and look similar .. code-block:: sh - Resource SchemaURL: - Resource labels: - -> telemetry.sdk.language: STRING(python) - -> telemetry.sdk.name: STRING(opentelemetry) - -> telemetry.sdk.version: STRING(1.8.0) - -> service.name: STRING(shoppingcart) - -> service.instance.id: STRING(instance-12) - InstrumentationLibraryLogs #0 - InstrumentationLibraryMetrics SchemaURL: - InstrumentationLibrary __main__ 0.1 - LogRecord #0 - Timestamp: 2022-01-13 20:37:03.998733056 +0000 UTC - Severity: WARNING - ShortName: - Body: Jail zesty vixen who grabbed pay from quack. - Trace ID: - Span ID: - Flags: 0 - LogRecord #1 - Timestamp: 2022-01-13 20:37:04.082757888 +0000 UTC - Severity: ERROR - ShortName: - Body: The five boxing wizards jump quickly. - Trace ID: - Span ID: - Flags: 0 - LogRecord #2 - Timestamp: 2022-01-13 20:37:04.082979072 +0000 UTC - Severity: ERROR - ShortName: - Body: Hyderabad, we have a major problem. - Trace ID: 63491217958f126f727622e41d4460f3 - Span ID: d90c57d6e1ca4f6c - Flags: 1 + ResourceLog #0 + Resource SchemaURL: + Resource attributes: + -> telemetry.sdk.language: Str(python) + -> telemetry.sdk.name: Str(opentelemetry) + -> telemetry.sdk.version: Str(1.33.0.dev0) + -> service.name: Str(shoppingcart) + -> service.instance.id: Str(instance-12) + ScopeLogs #0 + ScopeLogs SchemaURL: + InstrumentationScope myapp.area2 + LogRecord #0 + ObservedTimestamp: 2025-04-22 12:16:57.315179 +0000 UTC + Timestamp: 2025-04-22 12:16:57.315152896 +0000 UTC + SeverityText: WARN + SeverityNumber: Warn(13) + Body: Str(Jail zesty vixen who grabbed pay from quack.) 
+ Attributes: + -> code.filepath: Str(/Users/jayclifford/Repos/opentelemetry-python/docs/examples/logs/example.py) + -> code.function: Str() + -> code.lineno: Int(47) + Trace ID: + Span ID: + Flags: 0 + LogRecord #1 + ObservedTimestamp: 2025-04-22 12:16:57.31522 +0000 UTC + Timestamp: 2025-04-22 12:16:57.315213056 +0000 UTC + SeverityText: ERROR + SeverityNumber: Error(17) + Body: Str(The five boxing wizards jump quickly.) + Attributes: + -> code.filepath: Str(/Users/jayclifford/Repos/opentelemetry-python/docs/examples/logs/example.py) + -> code.function: Str() + -> code.lineno: Int(48) + Trace ID: + Span ID: + Flags: 0 + LogRecord #2 + ObservedTimestamp: 2025-04-22 12:16:57.315445 +0000 UTC + Timestamp: 2025-04-22 12:16:57.31543808 +0000 UTC + SeverityText: ERROR + SeverityNumber: Error(17) + Body: Str(Hyderabad, we have a major problem.) + Attributes: + -> code.filepath: Str(/Users/jayclifford/Repos/opentelemetry-python/docs/examples/logs/example.py) + -> code.function: Str() + -> code.lineno: Int(61) + Trace ID: 8a6739fffce895e694700944e2faf23e + Span ID: a45337020100cb63 + Flags: 1 + ScopeLogs #1 + ScopeLogs SchemaURL: + InstrumentationScope myapp.area1 + LogRecord #0 + ObservedTimestamp: 2025-04-22 12:16:57.315242 +0000 UTC + Timestamp: 2025-04-22 12:16:57.315234048 +0000 UTC + SeverityText: ERROR + SeverityNumber: Error(17) + Body: Str(I have custom attributes.) + Attributes: + -> user_id: Str(user-123) + -> code.filepath: Str(/Users/jayclifford/Repos/opentelemetry-python/docs/examples/logs/example.py) + -> code.function: Str() + -> code.lineno: Int(53) + Trace ID: + Span ID: + Flags: 0 diff --git a/docs/examples/logs/example.py b/docs/examples/logs/example.py index ba471ea7e69..c782d457533 100644 --- a/docs/examples/logs/example.py +++ b/docs/examples/logs/example.py @@ -47,6 +47,10 @@ logger2.warning("Jail zesty vixen who grabbed pay from quack.") logger2.error("The five boxing wizards jump quickly.") +# Log custom attributes +# Custom attributes are added on a per event basis +user_id = "user-123" +logger1.error("I have custom attributes.", extra={"user_id": user_id}) # Trace context correlation tracer = trace.get_tracer(__name__) diff --git a/docs/examples/logs/otel-collector-config.yaml b/docs/examples/logs/otel-collector-config.yaml index 50d29086415..64495c75091 100644 --- a/docs/examples/logs/otel-collector-config.yaml +++ b/docs/examples/logs/otel-collector-config.yaml @@ -6,7 +6,7 @@ receivers: exporters: debug: - verbosity: debug + verbosity: detailed processors: batch: diff --git a/docs/examples/metrics/reader/README.rst b/docs/examples/metrics/reader/README.rst index 4822fe77669..01a913f22a3 100644 --- a/docs/examples/metrics/reader/README.rst +++ b/docs/examples/metrics/reader/README.rst @@ -6,6 +6,7 @@ These examples show how to customize the metrics that are output by the SDK usin * preferred_aggregation.py: Shows how to configure the preferred aggregation for metric instrument types. * preferred_temporality.py: Shows how to configure the preferred temporality for metric instrument types. * preferred_exemplarfilter.py: Shows how to configure the exemplar filter. +* synchronous_gauge_read.py: Shows how to use `PeriodicExportingMetricReader` in a synchronous manner to explicitly control the collection of metrics. The source files of these examples are available :scm_web:`here `. 
diff --git a/docs/examples/metrics/reader/synchronous_gauge_read.py b/docs/examples/metrics/reader/synchronous_gauge_read.py
new file mode 100644
index 00000000000..d45f7ff00da
--- /dev/null
+++ b/docs/examples/metrics/reader/synchronous_gauge_read.py
@@ -0,0 +1,88 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Iterable
+
+from opentelemetry.metrics import (
+    CallbackOptions,
+    Observation,
+    get_meter_provider,
+    set_meter_provider,
+)
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import (
+    ConsoleMetricExporter,
+    PeriodicExportingMetricReader,
+)
+
+temperature = 0.0
+humidity = 0.0
+
+
+# Function called by the gauge to read the temperature
+def read_temperature(options: CallbackOptions) -> Iterable[Observation]:
+    global temperature
+    yield Observation(value=temperature, attributes={"room": "living-room"})
+
+
+# Function called by the gauge to read the humidity
+def read_humidity(options: CallbackOptions) -> Iterable[Observation]:
+    global humidity
+    yield Observation(value=humidity, attributes={"room": "living-room"})
+
+
+# Use console exporter for the example
+exporter = ConsoleMetricExporter()
+
+# If the export interval is set to math.inf, the PeriodicExportingMetricReader
+# will not invoke periodic collection
+reader = PeriodicExportingMetricReader(
+    exporter,
+    export_interval_millis=math.inf,
+)
+
+provider = MeterProvider(metric_readers=[reader])
+set_meter_provider(provider)
+
+meter = get_meter_provider().get_meter("synchronous_read", "0.1.2")
+
+gauge = meter.create_observable_gauge(
+    name="synchronous_gauge_temperature",
+    description="Gauge value captured synchronously",
+    callbacks=[read_temperature],
+)
+
+# Simulate synchronous reading of temperature
+print("--- Simulating synchronous reading of temperature ---", flush=True)
+temperature = 25.0
+reader.collect()
+# Note: The reader will only collect the last value before `collect` is called
+print("--- Last value only ---", flush=True)
+temperature = 30.0
+temperature = 35.0
+reader.collect()
+# Invoking `collect` will read all measurements assigned to the reader
+gauge2 = meter.create_observable_gauge(
+    name="synchronous_gauge_humidity",
+    description="Gauge value captured synchronously",
+    callbacks=[read_humidity],
+)
+print("--- Multiple Measurements ---", flush=True)
+temperature = 20.0
+humidity = 50.0
+reader.collect()
+# Invoking `force_flush` will read all measurements assigned to the reader
+print("--- Invoking force_flush ---", flush=True)
+provider.force_flush()

From 1b1e8d80c764ad3aa76abfb56a7002ddea11fdb5 Mon Sep 17 00:00:00 2001
From: Riccardo Magliocchetti
Date: Wed, 23 Apr 2025 16:36:16 +0200
Subject: [PATCH 03/21] opentelemetry-sdk: use stable code attributes (#4508)

---
 CHANGELOG.md                                  |  2 +
 .../sdk/_logs/_internal/__init__.py           | 21 ++++++-----
 opentelemetry-sdk/tests/logs/test_handler.py  | 37 +++++++++++--------
 3 files changed, 35 insertions(+), 25 deletions(-)

diff 
--git a/CHANGELOG.md b/CHANGELOG.md index 256312c8c44..cc7a26bb789 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +- opentelemetry-sdk: use stable code attributes: `code.function` -> `code.function.name`, `code.lineno` -> `code.line.number`, `code.filepath` -> `code.file.path` + ([#4508](https://github.com/open-telemetry/opentelemetry-python/pull/4508)) - Fix serialization of extended attributes for logs signal ([#4342](https://github.com/open-telemetry/opentelemetry-python/pull/4342)) - docs: updated and added to the metrics and log examples diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py index 58872f68020..9060e49aac4 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py @@ -45,7 +45,8 @@ from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.util import ns_to_iso_str from opentelemetry.sdk.util.instrumentation import InstrumentationScope -from opentelemetry.semconv.trace import SpanAttributes +from opentelemetry.semconv._incubating.attributes import code_attributes +from opentelemetry.semconv.attributes import exception_attributes from opentelemetry.trace import ( format_span_id, format_trace_id, @@ -487,22 +488,24 @@ def _get_attributes(record: logging.LogRecord) -> _ExtendedAttributes: } # Add standard code attributes for logs. - attributes[SpanAttributes.CODE_FILEPATH] = record.pathname - attributes[SpanAttributes.CODE_FUNCTION] = record.funcName - attributes[SpanAttributes.CODE_LINENO] = record.lineno + attributes[code_attributes.CODE_FILE_PATH] = record.pathname + attributes[code_attributes.CODE_FUNCTION_NAME] = record.funcName + attributes[code_attributes.CODE_LINE_NUMBER] = record.lineno if record.exc_info: exctype, value, tb = record.exc_info if exctype is not None: - attributes[SpanAttributes.EXCEPTION_TYPE] = exctype.__name__ + attributes[exception_attributes.EXCEPTION_TYPE] = ( + exctype.__name__ + ) if value is not None and value.args: - attributes[SpanAttributes.EXCEPTION_MESSAGE] = str( + attributes[exception_attributes.EXCEPTION_MESSAGE] = str( value.args[0] ) if tb is not None: - # https://github.com/open-telemetry/opentelemetry-specification/blob/9fa7c656b26647b27e485a6af7e38dc716eba98a/specification/trace/semantic_conventions/exceptions.md#stacktrace-representation - attributes[SpanAttributes.EXCEPTION_STACKTRACE] = "".join( - traceback.format_exception(*record.exc_info) + # https://opentelemetry.io/docs/specs/semconv/exceptions/exceptions-spans/#stacktrace-representation + attributes[exception_attributes.EXCEPTION_STACKTRACE] = ( + "".join(traceback.format_exception(*record.exc_info)) ) return attributes diff --git a/opentelemetry-sdk/tests/logs/test_handler.py b/opentelemetry-sdk/tests/logs/test_handler.py index 1b62cc6c788..3817c440258 100644 --- a/opentelemetry-sdk/tests/logs/test_handler.py +++ b/opentelemetry-sdk/tests/logs/test_handler.py @@ -27,7 +27,8 @@ LoggingHandler, LogRecordProcessor, ) -from opentelemetry.semconv.trace import SpanAttributes +from opentelemetry.semconv._incubating.attributes import code_attributes +from opentelemetry.semconv.attributes import exception_attributes from opentelemetry.trace import INVALID_SPAN_CONTEXT @@ -127,17 +128,19 @@ def test_log_record_user_attributes(self): 
self.assertEqual(len(log_record.attributes), 4) self.assertEqual(log_record.attributes["http.status_code"], 200) self.assertTrue( - log_record.attributes[SpanAttributes.CODE_FILEPATH].endswith( + log_record.attributes[code_attributes.CODE_FILE_PATH].endswith( "test_handler.py" ) ) self.assertEqual( - log_record.attributes[SpanAttributes.CODE_FUNCTION], + log_record.attributes[code_attributes.CODE_FUNCTION_NAME], "test_log_record_user_attributes", ) # The line of the log statement is not a constant (changing tests may change that), # so only check that the attribute is present. - self.assertTrue(SpanAttributes.CODE_LINENO in log_record.attributes) + self.assertTrue( + code_attributes.CODE_LINE_NUMBER in log_record.attributes + ) self.assertTrue(isinstance(log_record.attributes, BoundedAttributes)) def test_log_record_exception(self): @@ -156,15 +159,15 @@ def test_log_record_exception(self): self.assertTrue(isinstance(log_record.body, str)) self.assertEqual(log_record.body, "Zero Division Error") self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_TYPE], + log_record.attributes[exception_attributes.EXCEPTION_TYPE], ZeroDivisionError.__name__, ) self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_MESSAGE], + log_record.attributes[exception_attributes.EXCEPTION_MESSAGE], "division by zero", ) stack_trace = log_record.attributes[ - SpanAttributes.EXCEPTION_STACKTRACE + exception_attributes.EXCEPTION_STACKTRACE ] self.assertIsInstance(stack_trace, str) self.assertTrue("Traceback" in stack_trace) @@ -189,15 +192,15 @@ def test_log_record_recursive_exception(self): self.assertIsNotNone(log_record) self.assertEqual(log_record.body, "Zero Division Error") self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_TYPE], + log_record.attributes[exception_attributes.EXCEPTION_TYPE], ZeroDivisionError.__name__, ) self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_MESSAGE], + log_record.attributes[exception_attributes.EXCEPTION_MESSAGE], "division by zero", ) stack_trace = log_record.attributes[ - SpanAttributes.EXCEPTION_STACKTRACE + exception_attributes.EXCEPTION_STACKTRACE ] self.assertIsInstance(stack_trace, str) self.assertTrue("Traceback" in stack_trace) @@ -219,12 +222,14 @@ def test_log_exc_info_false(self): self.assertIsNotNone(log_record) self.assertEqual(log_record.body, "Zero Division Error") - self.assertNotIn(SpanAttributes.EXCEPTION_TYPE, log_record.attributes) self.assertNotIn( - SpanAttributes.EXCEPTION_MESSAGE, log_record.attributes + exception_attributes.EXCEPTION_TYPE, log_record.attributes + ) + self.assertNotIn( + exception_attributes.EXCEPTION_MESSAGE, log_record.attributes ) self.assertNotIn( - SpanAttributes.EXCEPTION_STACKTRACE, log_record.attributes + exception_attributes.EXCEPTION_STACKTRACE, log_record.attributes ) def test_log_record_exception_with_object_payload(self): @@ -246,15 +251,15 @@ def __str__(self): self.assertTrue(isinstance(log_record.body, str)) self.assertEqual(log_record.body, "CustomException stringified") self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_TYPE], + log_record.attributes[exception_attributes.EXCEPTION_TYPE], CustomException.__name__, ) self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_MESSAGE], + log_record.attributes[exception_attributes.EXCEPTION_MESSAGE], "CustomException message", ) stack_trace = log_record.attributes[ - SpanAttributes.EXCEPTION_STACKTRACE + exception_attributes.EXCEPTION_STACKTRACE ] self.assertIsInstance(stack_trace, str) 
self.assertTrue("Traceback" in stack_trace) From adbec5008b4b308ab03522d1caac532344a17199 Mon Sep 17 00:00:00 2001 From: Alex Boten <223565+codeboten@users.noreply.github.com> Date: Wed, 23 Apr 2025 13:16:09 -0700 Subject: [PATCH 04/21] bugfix(exporter): ensure response is closed (#4477) --- CHANGELOG.md | 2 ++ .../otlp/proto/http/_log_exporter/__init__.py | 29 ++++++++++++++----- .../proto/http/metric_exporter/__init__.py | 29 ++++++++++++++----- .../proto/http/trace_exporter/__init__.py | 29 ++++++++++++++----- 4 files changed, 68 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc7a26bb789..f379f34e0de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +- Fix intermittent `Connection aborted` error when using otlp/http exporters + ([#4477](https://github.com/open-telemetry/opentelemetry-python/pull/4477)) - opentelemetry-sdk: use stable code attributes: `code.function` -> `code.function.name`, `code.lineno` -> `code.line.number`, `code.filepath` -> `code.file.path` ([#4508](https://github.com/open-telemetry/opentelemetry-python/pull/4508)) - Fix serialization of extended attributes for logs signal diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 21b877380c8..f86f0113833 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -21,6 +21,7 @@ from typing import Dict, Optional, Sequence import requests +from requests.exceptions import ConnectionError from opentelemetry.exporter.otlp.proto.common._internal import ( _create_exp_backoff_generator, @@ -133,13 +134,27 @@ def _export(self, serialized_data: bytes): elif self._compression == Compression.Deflate: data = zlib.compress(serialized_data) - return self._session.post( - url=self._endpoint, - data=data, - verify=self._certificate_file, - timeout=self._timeout, - cert=self._client_cert, - ) + # By default, keep-alive is enabled in Session's request + # headers. Backends may choose to close the connection + # while a post happens which causes an unhandled + # exception. 
This try/except will retry the post on such exceptions + try: + resp = self._session.post( + url=self._endpoint, + data=data, + verify=self._certificate_file, + timeout=self._timeout, + cert=self._client_cert, + ) + except ConnectionError: + resp = self._session.post( + url=self._endpoint, + data=data, + verify=self._certificate_file, + timeout=self._timeout, + cert=self._client_cert, + ) + return resp @staticmethod def _retryable(resp: requests.Response) -> bool: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 00f429e4c97..4feea8d4302 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -29,6 +29,7 @@ import requests from deprecated import deprecated +from requests.exceptions import ConnectionError from opentelemetry.exporter.otlp.proto.common._internal import ( _create_exp_backoff_generator, @@ -175,13 +176,27 @@ def _export(self, serialized_data: bytes): elif self._compression == Compression.Deflate: data = zlib.compress(serialized_data) - return self._session.post( - url=self._endpoint, - data=data, - verify=self._certificate_file, - timeout=self._timeout, - cert=self._client_cert, - ) + # By default, keep-alive is enabled in Session's request + # headers. Backends may choose to close the connection + # while a post happens which causes an unhandled + # exception. This try/except will retry the post on such exceptions + try: + resp = self._session.post( + url=self._endpoint, + data=data, + verify=self._certificate_file, + timeout=self._timeout, + cert=self._client_cert, + ) + except ConnectionError: + resp = self._session.post( + url=self._endpoint, + data=data, + verify=self._certificate_file, + timeout=self._timeout, + cert=self._client_cert, + ) + return resp @staticmethod def _retryable(resp: requests.Response) -> bool: diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 7bcf4b4ced1..1841e5210a4 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -21,6 +21,7 @@ from typing import Dict, Optional import requests +from requests.exceptions import ConnectionError from opentelemetry.exporter.otlp.proto.common._internal import ( _create_exp_backoff_generator, @@ -130,13 +131,27 @@ def _export(self, serialized_data: bytes): elif self._compression == Compression.Deflate: data = zlib.compress(serialized_data) - return self._session.post( - url=self._endpoint, - data=data, - verify=self._certificate_file, - timeout=self._timeout, - cert=self._client_cert, - ) + # By default, keep-alive is enabled in Session's request + # headers. Backends may choose to close the connection + # while a post happens which causes an unhandled + # exception. 
This try/except will retry the post on such exceptions + try: + resp = self._session.post( + url=self._endpoint, + data=data, + verify=self._certificate_file, + timeout=self._timeout, + cert=self._client_cert, + ) + except ConnectionError: + resp = self._session.post( + url=self._endpoint, + data=data, + verify=self._certificate_file, + timeout=self._timeout, + cert=self._client_cert, + ) + return resp @staticmethod def _retryable(resp: requests.Response) -> bool: From 00329e07fb01d7c3e43bb513fe9be3748745c52e Mon Sep 17 00:00:00 2001 From: DylanRussell Date: Thu, 24 Apr 2025 12:50:50 -0400 Subject: [PATCH 05/21] Refactor BatchLogRecordProcessor and associated tests (#4535) --- .../sdk/_logs/_internal/export/__init__.py | 226 ++++++---------- .../sdk/environment_variables/__init__.py | 1 + opentelemetry-sdk/tests/logs/test_export.py | 242 +++++++++--------- 3 files changed, 200 insertions(+), 269 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py index a4eb113c89b..254c5f6b96d 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py @@ -22,8 +22,7 @@ import threading import weakref from os import environ, linesep -from time import time_ns -from typing import IO, Callable, Deque, List, Optional, Sequence +from typing import IO, Callable, Deque, Optional, Sequence from opentelemetry.context import ( _SUPPRESS_INSTRUMENTATION_KEY, @@ -56,6 +55,12 @@ class LogExportResult(enum.Enum): FAILURE = 1 +class BatchLogExportStrategy(enum.Enum): + EXPORT_ALL = 0 + EXPORT_WHILE_BATCH_EXCEEDS_THRESHOLD = 1 + EXPORT_AT_LEAST_ONE_BATCH = 2 + + class LogExporter(abc.ABC): """Interface for exporting logs. @@ -141,14 +146,6 @@ def force_flush(self, timeout_millis: int = 30000) -> bool: # pylint: disable=n return True -class _FlushRequest: - __slots__ = ["event", "num_log_records"] - - def __init__(self): - self.event = threading.Event() - self.num_log_records = 0 - - _BSP_RESET_ONCE = Once() @@ -167,8 +164,6 @@ class BatchLogRecordProcessor(LogRecordProcessor): """ _queue: Deque[LogData] - _flush_request: _FlushRequest | None - _log_records: List[LogData | None] def __init__( self, @@ -190,7 +185,7 @@ def __init__( max_export_batch_size = ( BatchLogRecordProcessor._default_max_export_batch_size() ) - + # Not used. No way currently to pass timeout to export. if export_timeout_millis is None: export_timeout_millis = ( BatchLogRecordProcessor._default_export_timeout_millis() @@ -202,27 +197,45 @@ def __init__( self._exporter = exporter self._max_queue_size = max_queue_size - self._schedule_delay_millis = schedule_delay_millis + self._schedule_delay = schedule_delay_millis / 1e3 self._max_export_batch_size = max_export_batch_size + # Not used. No way currently to pass timeout to export. + # TODO(https://github.com/open-telemetry/opentelemetry-python/issues/4555): figure out what this should do. self._export_timeout_millis = export_timeout_millis + # Deque is thread safe. 
self._queue = collections.deque([], max_queue_size) self._worker_thread = threading.Thread( name="OtelBatchLogRecordProcessor", target=self.worker, daemon=True, ) - self._condition = threading.Condition(threading.Lock()) + self._shutdown = False - self._flush_request = None - self._log_records = [None] * self._max_export_batch_size + self._export_lock = threading.Lock() + self._worker_awaken = threading.Event() self._worker_thread.start() if hasattr(os, "register_at_fork"): weak_reinit = weakref.WeakMethod(self._at_fork_reinit) os.register_at_fork(after_in_child=lambda: weak_reinit()()) # pylint: disable=unnecessary-lambda self._pid = os.getpid() + def _should_export_batch( + self, batch_strategy: BatchLogExportStrategy, num_iterations: int + ) -> bool: + if not self._queue: + return False + # Always continue to export while queue length exceeds max batch size. + if len(self._queue) >= self._max_export_batch_size: + return True + if batch_strategy is BatchLogExportStrategy.EXPORT_ALL: + return True + if batch_strategy is BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: + return num_iterations == 0 + return False + def _at_fork_reinit(self): - self._condition = threading.Condition(threading.Lock()) + self._export_lock = threading.Lock() + self._worker_awaken = threading.Event() self._queue.clear() self._worker_thread = threading.Thread( name="OtelBatchLogRecordProcessor", @@ -233,152 +246,75 @@ def _at_fork_reinit(self): self._pid = os.getpid() def worker(self): - timeout = self._schedule_delay_millis / 1e3 - flush_request: Optional[_FlushRequest] = None while not self._shutdown: - with self._condition: - if self._shutdown: - # shutdown may have been called, avoid further processing - break - flush_request = self._get_and_unset_flush_request() - if ( - len(self._queue) < self._max_export_batch_size - and flush_request is None - ): - self._condition.wait(timeout) - - flush_request = self._get_and_unset_flush_request() - if not self._queue: - timeout = self._schedule_delay_millis / 1e3 - self._notify_flush_request_finished(flush_request) - flush_request = None - continue - if self._shutdown: - break - - start_ns = time_ns() - self._export(flush_request) - end_ns = time_ns() - # subtract the duration of this export call to the next timeout - timeout = self._schedule_delay_millis / 1e3 - ( - (end_ns - start_ns) / 1e9 - ) - - self._notify_flush_request_finished(flush_request) - flush_request = None - - # there might have been a new flush request while export was running - # and before the done flag switched to true - with self._condition: - shutdown_flush_request = self._get_and_unset_flush_request() - - # flush the remaining logs - self._drain_queue() - self._notify_flush_request_finished(flush_request) - self._notify_flush_request_finished(shutdown_flush_request) - - def _export(self, flush_request: Optional[_FlushRequest] = None): - """Exports logs considering the given flush_request. - - If flush_request is not None then logs are exported in batches - until the number of exported logs reached or exceeded the num of logs in - flush_request, otherwise exports at max max_export_batch_size logs. - """ - if flush_request is None: - self._export_batch() - return - - num_log_records = flush_request.num_log_records - while self._queue: - exported = self._export_batch() - num_log_records -= exported - - if num_log_records <= 0: + # Lots of strategies in the spec for setting next timeout. 
+            # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/sdk.md#batching-processor.
+            # Shutdown will interrupt this sleep. Emit will interrupt this sleep only if the queue is bigger than threshold.
+            sleep_interrupted = self._worker_awaken.wait(self._schedule_delay)
+            if self._shutdown:
                 break
-
-    def _export_batch(self) -> int:
-        """Exports at most max_export_batch_size logs and returns the number of
-        exported logs.
-        """
-        idx = 0
-        while idx < self._max_export_batch_size and self._queue:
-            record = self._queue.pop()
-            self._log_records[idx] = record
-            idx += 1
-        token = attach(set_value(_SUPPRESS_INSTRUMENTATION_KEY, True))
-        try:
-            self._exporter.export(self._log_records[:idx])  # type: ignore
-        except Exception:  # pylint: disable=broad-exception-caught
-            _logger.exception("Exception while exporting logs.")
-        detach(token)
-
-        for index in range(idx):
-            self._log_records[index] = None
-        return idx
-
-    def _drain_queue(self):
-        """Export all elements until queue is empty.
-
-        Can only be called from the worker thread context because it invokes
-        `export` that is not thread safe.
-        """
-        while self._queue:
-            self._export_batch()
-
-    def _get_and_unset_flush_request(self) -> Optional[_FlushRequest]:
-        flush_request = self._flush_request
-        self._flush_request = None
-        if flush_request is not None:
-            flush_request.num_log_records = len(self._queue)
-        return flush_request
-
-    @staticmethod
-    def _notify_flush_request_finished(
-        flush_request: Optional[_FlushRequest] = None,
-    ):
-        if flush_request is not None:
-            flush_request.event.set()
-
-    def _get_or_create_flush_request(self) -> _FlushRequest:
-        if self._flush_request is None:
-            self._flush_request = _FlushRequest()
-        return self._flush_request
+            self._export(
+                BatchLogExportStrategy.EXPORT_WHILE_BATCH_EXCEEDS_THRESHOLD
+                if sleep_interrupted
+                else BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH
+            )
+            self._worker_awaken.clear()
+        self._export(BatchLogExportStrategy.EXPORT_ALL)
+
+    def _export(self, batch_strategy: BatchLogExportStrategy) -> None:
+        with self._export_lock:
+            iteration = 0
+            # We could see concurrent export calls from worker and force_flush. We call _should_export_batch
+            # once the lock is obtained to see if we still need to make the requested export.
+            while self._should_export_batch(batch_strategy, iteration):
+                iteration += 1
+                token = attach(set_value(_SUPPRESS_INSTRUMENTATION_KEY, True))
+                try:
+                    self._exporter.export(
+                        [
+                            # Oldest records are at the back, so pop from there.
+                            self._queue.pop()
+                            for _ in range(
+                                min(
+                                    self._max_export_batch_size,
+                                    len(self._queue),
+                                )
+                            )
+                        ]
+                    )
+                except Exception:  # pylint: disable=broad-exception-caught
+                    _logger.exception("Exception while exporting logs.")
+                detach(token)

     def emit(self, log_data: LogData) -> None:
-        """Adds the `LogData` to queue and notifies the waiting threads
-        when size of queue reaches max_export_batch_size.
-        """
         if self._shutdown:
+            _logger.info("Shutdown called, ignoring log.")
             return
         if self._pid != os.getpid():
             _BSP_RESET_ONCE.do_once(self._at_fork_reinit)

+        if len(self._queue) == self._max_queue_size:
+            _logger.warning("Queue full, dropping log.")
         self._queue.appendleft(log_data)
         if len(self._queue) >= self._max_export_batch_size:
-            with self._condition:
-                self._condition.notify()
+            self._worker_awaken.set()

     def shutdown(self):
+        if self._shutdown:
+            return
+        # Prevents emit and force_flush from further calling export.
self._shutdown = True - with self._condition: - self._condition.notify_all() + # Interrupts sleep in the worker, if it's sleeping. + self._worker_awaken.set() + # Main worker loop should exit after one final export call with flush all strategy. self._worker_thread.join() self._exporter.shutdown() def force_flush(self, timeout_millis: Optional[int] = None) -> bool: - if timeout_millis is None: - timeout_millis = self._export_timeout_millis if self._shutdown: - return True - - with self._condition: - flush_request = self._get_or_create_flush_request() - self._condition.notify_all() - - ret = flush_request.event.wait(timeout_millis / 1e3) - if not ret: - _logger.warning("Timeout was exceeded in force_flush().") - return ret + return + # Blocking call to export. + self._export(BatchLogExportStrategy.EXPORT_ALL) @staticmethod def _default_max_queue_size(): diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py index 4f69143084c..23b634fcd85 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py @@ -87,6 +87,7 @@ .. envvar:: OTEL_BLRP_EXPORT_TIMEOUT The :envvar:`OTEL_BLRP_EXPORT_TIMEOUT` represents the maximum allowed time to export data from the BatchLogRecordProcessor. +This environment variable currently does nothing, see https://github.com/open-telemetry/opentelemetry-python/issues/4555. Default: 30000 """ diff --git a/opentelemetry-sdk/tests/logs/test_export.py b/opentelemetry-sdk/tests/logs/test_export.py index b9ec0ac2e7f..6511b137a92 100644 --- a/opentelemetry-sdk/tests/logs/test_export.py +++ b/opentelemetry-sdk/tests/logs/test_export.py @@ -50,6 +50,11 @@ from opentelemetry.trace import TraceFlags from opentelemetry.trace.span import INVALID_SPAN_CONTEXT +EMPTY_LOG = LogData( + log_record=LogRecord(), + instrumentation_scope=InstrumentationScope("example", "example"), +) + class TestSimpleLogRecordProcessor(unittest.TestCase): def test_simple_log_record_processor_default_level(self): @@ -328,7 +333,7 @@ def test_simple_log_record_processor_different_msg_types_with_formatter( self.assertEqual(expected, emitted) -class TestBatchLogRecordProcessor(ConcurrencyTestBase): +class TestBatchLogRecordProcessor(unittest.TestCase): def test_emit_call_log_record(self): exporter = InMemoryLogExporter() log_record_processor = Mock(wraps=BatchLogRecordProcessor(exporter)) @@ -353,7 +358,7 @@ def test_args(self): ) self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 1024) - self.assertEqual(log_record_processor._schedule_delay_millis, 2500) + self.assertEqual(log_record_processor._schedule_delay, 2.5) self.assertEqual(log_record_processor._max_export_batch_size, 256) self.assertEqual(log_record_processor._export_timeout_millis, 15000) @@ -371,7 +376,7 @@ def test_env_vars(self): log_record_processor = BatchLogRecordProcessor(exporter) self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 1024) - self.assertEqual(log_record_processor._schedule_delay_millis, 2500) + self.assertEqual(log_record_processor._schedule_delay, 2.5) self.assertEqual(log_record_processor._max_export_batch_size, 256) self.assertEqual(log_record_processor._export_timeout_millis, 15000) @@ -380,7 +385,7 @@ def test_args_defaults(self): log_record_processor = BatchLogRecordProcessor(exporter) 
         self.assertEqual(log_record_processor._exporter, exporter)
         self.assertEqual(log_record_processor._max_queue_size, 2048)
-        self.assertEqual(log_record_processor._schedule_delay_millis, 5000)
+        self.assertEqual(log_record_processor._schedule_delay, 5)
         self.assertEqual(log_record_processor._max_export_batch_size, 512)
         self.assertEqual(log_record_processor._export_timeout_millis, 30000)
@@ -400,7 +405,7 @@ def test_args_env_var_value_error(self):
         _logger.disabled = False
         self.assertEqual(log_record_processor._exporter, exporter)
         self.assertEqual(log_record_processor._max_queue_size, 2048)
-        self.assertEqual(log_record_processor._schedule_delay_millis, 5000)
+        self.assertEqual(log_record_processor._schedule_delay, 5)
         self.assertEqual(log_record_processor._max_export_batch_size, 512)
         self.assertEqual(log_record_processor._export_timeout_millis, 30000)
@@ -415,7 +420,7 @@ def test_args_none_defaults(self):
         )
         self.assertEqual(log_record_processor._exporter, exporter)
         self.assertEqual(log_record_processor._max_queue_size, 2048)
-        self.assertEqual(log_record_processor._schedule_delay_millis, 5000)
+        self.assertEqual(log_record_processor._schedule_delay, 5)
         self.assertEqual(log_record_processor._max_export_batch_size, 512)
         self.assertEqual(log_record_processor._export_timeout_millis, 30000)
@@ -465,161 +470,155 @@ def test_validation_negative_max_queue_size(self):
             max_export_batch_size=101,
         )

-    def test_shutdown(self):
-        exporter = InMemoryLogExporter()
-        log_record_processor = BatchLogRecordProcessor(exporter)
-
-        provider = LoggerProvider()
-        provider.add_log_record_processor(log_record_processor)
-
-        logger = logging.getLogger("shutdown")
-        logger.addHandler(LoggingHandler(logger_provider=provider))
-
-        with self.assertLogs(level=logging.WARNING):
-            logger.warning("warning message: %s", "possible upcoming heatwave")
-        with self.assertLogs(level=logging.WARNING):
-            logger.error("Very high rise in temperatures across the globe")
-        with self.assertLogs(level=logging.WARNING):
-            logger.critical("Temperature hits high 420 C in Hyderabad")
+    def test_logs_exported_once_batch_size_reached(self):
+        exporter = Mock()
+        log_record_processor = BatchLogRecordProcessor(
+            exporter=exporter,
+            max_queue_size=15,
+            max_export_batch_size=15,
+            # Will not be reached during the test; this sleep should be interrupted when the batch size is reached.
+            schedule_delay_millis=30000,
+        )
+        before_export = time.time_ns()
+        for _ in range(15):
+            log_record_processor.emit(EMPTY_LOG)
+        # Wait a bit for the worker thread to wake up and call export.
+        time.sleep(0.1)
+        exporter.export.assert_called_once()
+        after_export = time.time_ns()
+        # Shows the worker's 30-second sleep was interrupted within a second.
+        self.assertLess(after_export - before_export, 1e9)
+
+    # pylint: disable=no-self-use
+    def test_logs_exported_once_schedule_delay_reached(self):
+        exporter = Mock()
+        log_record_processor = BatchLogRecordProcessor(
+            exporter=exporter,
+            max_queue_size=15,
+            max_export_batch_size=15,
+            schedule_delay_millis=100,
+        )
+        log_record_processor.emit(EMPTY_LOG)
+        time.sleep(0.2)
+        exporter.export.assert_called_once_with([EMPTY_LOG])

+    def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self):
+        exporter = Mock()
+        log_record_processor = BatchLogRecordProcessor(
+            exporter=exporter,
+            # Neither of these thresholds should be hit before test ends.
+            max_queue_size=15,
+            max_export_batch_size=15,
+            schedule_delay_millis=30000,
+        )
+        # This log should be flushed because it was written before shutdown.
+ log_record_processor.emit(EMPTY_LOG) log_record_processor.shutdown() + exporter.export.assert_called_once_with([EMPTY_LOG]) self.assertTrue(exporter._stopped) - finished_logs = exporter.get_finished_logs() - expected = [ - ("warning message: possible upcoming heatwave", "WARN"), - ("Very high rise in temperatures across the globe", "ERROR"), - ( - "Temperature hits high 420 C in Hyderabad", - "CRITICAL", - ), - ] - emitted = [ - (item.log_record.body, item.log_record.severity_text) - for item in finished_logs - ] - self.assertEqual(expected, emitted) - for item in finished_logs: - self.assertEqual(item.instrumentation_scope.name, "shutdown") - - def test_force_flush(self): - exporter = InMemoryLogExporter() - log_record_processor = BatchLogRecordProcessor(exporter) - - provider = LoggerProvider() - provider.add_log_record_processor(log_record_processor) - - logger = logging.getLogger("force_flush") - logger.propagate = False - logger.addHandler(LoggingHandler(logger_provider=provider)) - - logger.critical("Earth is burning") - log_record_processor.force_flush() - finished_logs = exporter.get_finished_logs() - self.assertEqual(len(finished_logs), 1) - log_record = finished_logs[0].log_record - self.assertEqual(log_record.body, "Earth is burning") - self.assertEqual(log_record.severity_number, SeverityNumber.FATAL) - self.assertEqual( - finished_logs[0].instrumentation_scope.name, "force_flush" + with self.assertLogs(level="INFO") as log: + # This log should not be flushed. + log_record_processor.emit(EMPTY_LOG) + self.assertEqual(len(log.output), 1) + self.assertEqual(len(log.records), 1) + self.assertIn("Shutdown called, ignoring log.", log.output[0]) + exporter.export.assert_called_once() + + # pylint: disable=no-self-use + def test_force_flush_flushes_logs(self): + exporter = Mock() + log_record_processor = BatchLogRecordProcessor( + exporter=exporter, + # Neither of these thresholds should be hit before test ends. 
+ max_queue_size=15, + max_export_batch_size=15, + schedule_delay_millis=30000, ) - - def test_log_record_processor_too_many_logs(self): - exporter = InMemoryLogExporter() - log_record_processor = BatchLogRecordProcessor(exporter) - - provider = LoggerProvider() - provider.add_log_record_processor(log_record_processor) - - logger = logging.getLogger("many_logs") - logger.propagate = False - logger.addHandler(LoggingHandler(logger_provider=provider)) - - for log_no in range(1000): - logger.critical("Log no: %s", log_no) - - self.assertTrue(log_record_processor.force_flush()) - finised_logs = exporter.get_finished_logs() - self.assertEqual(len(finised_logs), 1000) - for item in finised_logs: - self.assertEqual(item.instrumentation_scope.name, "many_logs") + for _ in range(10): + log_record_processor.emit(EMPTY_LOG) + log_record_processor.force_flush() + exporter.export.assert_called_once_with([EMPTY_LOG for _ in range(10)]) def test_with_multiple_threads(self): exporter = InMemoryLogExporter() log_record_processor = BatchLogRecordProcessor(exporter) - provider = LoggerProvider() - provider.add_log_record_processor(log_record_processor) - - logger = logging.getLogger("threads") - logger.propagate = False - logger.addHandler(LoggingHandler(logger_provider=provider)) - def bulk_log_and_flush(num_logs): for _ in range(num_logs): - logger.critical("Critical message") - self.assertTrue(log_record_processor.force_flush()) + log_record_processor.emit(EMPTY_LOG) + log_record_processor.force_flush() with ThreadPoolExecutor(max_workers=69) as executor: - futures = [] for idx in range(69): - future = executor.submit(bulk_log_and_flush, idx + 1) - futures.append(future) + executor.submit(bulk_log_and_flush, idx + 1) executor.shutdown() finished_logs = exporter.get_finished_logs() self.assertEqual(len(finished_logs), 2415) - for item in finished_logs: - self.assertEqual(item.instrumentation_scope.name, "threads") @unittest.skipUnless( hasattr(os, "fork"), "needs *nix", ) - def test_batch_log_record_processor_fork(self): - # pylint: disable=invalid-name + def test_batch_log_record_processor_fork_clears_logs_from_child(self): exporter = InMemoryLogExporter() log_record_processor = BatchLogRecordProcessor( exporter, max_export_batch_size=64, - schedule_delay_millis=10, + schedule_delay_millis=30000, ) - provider = LoggerProvider() - provider.add_log_record_processor(log_record_processor) + # These logs should be flushed only from the parent process. + # _at_fork_reinit should be called in the child process, to + # clear these logs in the child process. + for _ in range(10): + log_record_processor.emit(EMPTY_LOG) - logger = logging.getLogger("test-fork") - logger.propagate = False - logger.addHandler(LoggingHandler(logger_provider=provider)) + # The below test also needs this, but it can only be set once. 
+ multiprocessing.set_start_method("fork") - logger.critical("yolo") - time.sleep(0.5) # give some time for the exporter to upload + def child(conn): + log_record_processor.force_flush() + logs = exporter.get_finished_logs() + conn.send(len(logs) == 0) + conn.close() - self.assertTrue(log_record_processor.force_flush()) - self.assertEqual(len(exporter.get_finished_logs()), 1) - exporter.clear() + parent_conn, child_conn = multiprocessing.Pipe() + process = multiprocessing.Process(target=child, args=(child_conn,)) + process.start() + self.assertTrue(parent_conn.recv()) + process.join() + log_record_processor.force_flush() + self.assertTrue(len(exporter.get_finished_logs()) == 10) - multiprocessing.set_start_method("fork") + @unittest.skipUnless( + hasattr(os, "fork"), + "needs *nix", + ) + def test_batch_log_record_processor_fork_doesnot_deadlock(self): + exporter = InMemoryLogExporter() + log_record_processor = BatchLogRecordProcessor( + exporter, + max_export_batch_size=64, + schedule_delay_millis=30000, + ) def child(conn): def _target(): - logger.critical("Critical message child") - - self.run_with_many_threads(_target, 100) - - time.sleep(0.5) + log_record_processor.emit(EMPTY_LOG) + ConcurrencyTestBase.run_with_many_threads(_target, 100) + log_record_processor.force_flush() logs = exporter.get_finished_logs() conn.send(len(logs) == 100) conn.close() parent_conn, child_conn = multiprocessing.Pipe() - p = multiprocessing.Process(target=child, args=(child_conn,)) - p.start() + process = multiprocessing.Process(target=child, args=(child_conn,)) + process.start() self.assertTrue(parent_conn.recv()) - p.join() - - log_record_processor.shutdown() + process.join() def test_batch_log_record_processor_gc(self): # Given a BatchLogRecordProcessor @@ -680,11 +679,6 @@ def formatter(record): # pylint: disable=unused-argument mock_stdout = Mock() exporter = ConsoleLogExporter(out=mock_stdout, formatter=formatter) - log_data = LogData( - log_record=LogRecord(), - instrumentation_scope=InstrumentationScope( - "first_name", "first_version" - ), - ) - exporter.export([log_data]) + exporter.export([EMPTY_LOG]) + mock_stdout.write.assert_called_once_with(mock_record_str) From e46db882882a6cbf8c43ef6bb7050510514e81ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Em=C3=ADdio=20Neto?= <9735060+emdneto@users.noreply.github.com> Date: Mon, 28 Apr 2025 11:54:15 -0300 Subject: [PATCH 06/21] infra: Automate SHA procedure during releases (#4547) * trying sha-automation Signed-off-by: emdneto <9735060+emdneto@users.noreply.github.com> * fix label names * fix sha-automation core Signed-off-by: emdneto <9735060+emdneto@users.noreply.github.com> * add new line Signed-off-by: emdneto <9735060+emdneto@users.noreply.github.com> --------- Signed-off-by: emdneto <9735060+emdneto@users.noreply.github.com> --- .github/workflows/contrib.yml | 6 ++++- .github/workflows/lint_0.yml | 10 +++++++- .github/workflows/misc_0.yml | 10 +++++++- .github/workflows/prepare-patch-release.yml | 13 ++++++++-- .github/workflows/prepare-release-branch.yml | 26 +++++++++++++++++--- .github/workflows/templates/lint.yml.j2 | 10 +++++++- .github/workflows/templates/misc.yml.j2 | 10 +++++++- .github/workflows/templates/test.yml.j2 | 10 +++++++- .github/workflows/test_0.yml | 10 +++++++- .github/workflows/test_1.yml | 10 +++++++- 10 files changed, 101 insertions(+), 14 deletions(-) diff --git a/.github/workflows/contrib.yml b/.github/workflows/contrib.yml index d59a452239b..395f3b31a49 100644 --- a/.github/workflows/contrib.yml +++ 
b/.github/workflows/contrib.yml @@ -15,4 +15,8 @@ jobs: uses: open-telemetry/opentelemetry-python-contrib/.github/workflows/core_contrib_test_0.yml@main with: CORE_REPO_SHA: ${{ github.sha }} - CONTRIB_REPO_SHA: main + CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }} diff --git a/.github/workflows/lint_0.yml b/.github/workflows/lint_0.yml index e06b0b65fce..77320068972 100644 --- a/.github/workflows/lint_0.yml +++ b/.github/workflows/lint_0.yml @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. + CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/misc_0.yml b/.github/workflows/misc_0.yml index 0b7999d3bd6..1497bbe8c45 100644 --- a/.github/workflows/misc_0.yml +++ b/.github/workflows/misc_0.yml @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. + CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/prepare-patch-release.yml b/.github/workflows/prepare-patch-release.yml index e37b78afae3..680b3842b99 100644 --- a/.github/workflows/prepare-patch-release.yml +++ b/.github/workflows/prepare-patch-release.yml @@ -65,6 +65,7 @@ jobs: run: .github/scripts/use-cla-approved-github-bot.sh - name: Create pull request + id: create_pr env: # not using secrets.GITHUB_TOKEN since pull requests from that token do not run workflows GITHUB_TOKEN: ${{ secrets.OPENTELEMETRYBOT_GITHUB_TOKEN }} @@ -74,7 +75,15 @@ jobs: git commit -a -m "$message" git push origin HEAD:$branch - gh pr create --title "[$GITHUB_REF_NAME] $message" \ + pr_url=$(gh pr create --title "[$GITHUB_REF_NAME] $message" \ --body "$message." 
\ --head $branch \ - --base $GITHUB_REF_NAME + --base $GITHUB_REF_NAME) + + echo "pr_url=$pr_url" >> $GITHUB_OUTPUT + - name: Add prepare-release label to PR + if: steps.create_pr.outputs.pr_url != '' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr edit ${{ steps.create_pr.outputs.pr_url }} --add-label "prepare-release" diff --git a/.github/workflows/prepare-release-branch.yml b/.github/workflows/prepare-release-branch.yml index 18bad26bfbb..edb906ed16c 100644 --- a/.github/workflows/prepare-release-branch.yml +++ b/.github/workflows/prepare-release-branch.yml @@ -91,6 +91,7 @@ jobs: run: .github/scripts/use-cla-approved-github-bot.sh - name: Create pull request against the release branch + id: create_release_branch_pr env: # not using secrets.GITHUB_TOKEN since pull requests from that token do not run workflows GITHUB_TOKEN: ${{ secrets.OPENTELEMETRYBOT_GITHUB_TOKEN }} @@ -100,10 +101,18 @@ jobs: git commit -a -m "$message" git push origin HEAD:$branch - gh pr create --title "[$RELEASE_BRANCH_NAME] $message" \ + pr_url=$(gh pr create --title "[$RELEASE_BRANCH_NAME] $message" \ --body "$message." \ --head $branch \ - --base $RELEASE_BRANCH_NAME + --base $RELEASE_BRANCH_NAME) + echo "pr_url=$pr_url" >> $GITHUB_OUTPUT + + - name: Add prepare-release label to PR + if: steps.create_release_branch_pr.outputs.pr_url != '' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr edit ${{ steps.create_release_branch_pr.outputs.pr_url }} --add-label "prepare-release" create-pull-request-against-main: runs-on: ubuntu-latest @@ -170,6 +179,7 @@ jobs: run: .github/scripts/use-cla-approved-github-bot.sh - name: Create pull request against main + id: create_main_pr env: # not using secrets.GITHUB_TOKEN since pull requests from that token do not run workflows GITHUB_TOKEN: ${{ secrets.OPENTELEMETRYBOT_GITHUB_TOKEN }} @@ -180,7 +190,15 @@ jobs: git commit -a -m "$message" git push origin HEAD:$branch - gh pr create --title "$message" \ + pr_url=$(gh pr create --title "$message" \ --body "$body" \ --head $branch \ - --base main + --base main) + echo "pr_url=$pr_url" >> $GITHUB_OUTPUT + + - name: Add prepare-release label to PR + if: steps.create_main_pr.outputs.pr_url != '' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr edit ${{ steps.create_main_pr.outputs.pr_url }} --add-label "prepare-release" diff --git a/.github/workflows/templates/lint.yml.j2 b/.github/workflows/templates/lint.yml.j2 index e373be8d69e..169f8f61dc4 100644 --- a/.github/workflows/templates/lint.yml.j2 +++ b/.github/workflows/templates/lint.yml.j2 @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. 
+ CONTRIB_REPO_SHA: {% raw %}${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }}{% endraw %} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/templates/misc.yml.j2 b/.github/workflows/templates/misc.yml.j2 index 1cd3c27a42d..d2f5fed3b83 100644 --- a/.github/workflows/templates/misc.yml.j2 +++ b/.github/workflows/templates/misc.yml.j2 @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. + CONTRIB_REPO_SHA: {% raw %}${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }}{% endraw %} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/templates/test.yml.j2 b/.github/workflows/templates/test.yml.j2 index efd9e311224..8e8338b6237 100644 --- a/.github/workflows/templates/test.yml.j2 +++ b/.github/workflows/templates/test.yml.j2 @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. + CONTRIB_REPO_SHA: {% raw %}${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }}{% endraw %} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/test_0.yml b/.github/workflows/test_0.yml index 2b33a23c476..dcfec0fa0a7 100644 --- a/.github/workflows/test_0.yml +++ b/.github/workflows/test_0.yml @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. 
+  CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && (
+      contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref ||
+      contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref ||
+      'main'
+    ) || 'main' }}
   PIP_EXISTS_ACTION: w

 jobs:
diff --git a/.github/workflows/test_1.yml b/.github/workflows/test_1.yml
index de5a446d3f6..b3d5a75ff4a 100644
--- a/.github/workflows/test_1.yml
+++ b/.github/workflows/test_1.yml
@@ -15,7 +15,15 @@ concurrency:
 env:
   CORE_REPO_SHA: main
-  CONTRIB_REPO_SHA: main
+  # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main'
+  # For PRs you can change the inner fallback ('main')
+  # For pushes you change the outer fallback ('main')
+  # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo.
+  CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && (
+      contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref ||
+      contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref ||
+      'main'
+    ) || 'main' }}
   PIP_EXISTS_ACTION: w

 jobs:

From 561f347695ab6a195c38ea8a2834539d225eac46 Mon Sep 17 00:00:00 2001
From: Dylan Russell
Date: Mon, 28 Apr 2025 19:23:56 +0000
Subject: [PATCH 07/21] Add timeout millis param to export.

---
 .../tests/test_otlp_trace_exporter.py           |  2 +-
 .../sdk/_logs/_internal/export/__init__.py      | 15 +++++++++++++--
 .../_internal/export/in_memory_log_exporter.py  |  1 +
 .../opentelemetry/sdk/trace/export/__init__.py  | 11 +++++++++--
 4 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py
index ea39a7792d4..5238dc91224 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py
@@ -16,7 +16,7 @@
 import os
 from unittest import TestCase
-from unittest.mock import Mock, PropertyMock, patch, ANY
+from unittest.mock import ANY, Mock, PropertyMock, patch

 from grpc import ChannelCredentials, Compression

diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
index a4eb113c89b..5e40aca3009 100644
--- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
+++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
@@ -67,11 +67,14 @@ class LogExporter(abc.ABC):
     """

     @abc.abstractmethod
-    def export(self, batch: Sequence[LogData]):
+    def export(
+        self, batch: Sequence[LogData], timeout_millis: Optional[int] = None
+    ):
         """Exports a batch of logs.

         Args:
-            batch: The list of `LogData` objects to be exported
+            batch: The list of `LogData` objects to be exported.
+            timeout_millis: Optional milliseconds until the export should time out if it hasn't succeeded.

         Returns:
             The result of the export
         """
@@ -84,6 +87,13 @@ def shutdown(self):

         Called when the SDK is shut down.
""" + @abc.abstractmethod + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Hint to ensure that the export of any spans the exporter has received + prior to the call to ForceFlush SHOULD be completed as soon as possible, preferably + before returning from this method. + """ + class ConsoleLogExporter(LogExporter): """Implementation of :class:`LogExporter` that prints log records to the @@ -102,6 +112,7 @@ def __init__( self.out = out self.formatter = formatter + # pylint: disable=arguments-differ def export(self, batch: Sequence[LogData]): for data in batch: self.out.write(self.formatter(data.log_record)) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py index 68cb6b7389a..910e2cb17c2 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py @@ -40,6 +40,7 @@ def get_finished_logs(self) -> typing.Tuple[LogData, ...]: with self._lock: return tuple(self._logs) + # pylint: disable=arguments-differ def export(self, batch: typing.Sequence[LogData]) -> LogExportResult: if self._stopped: return LogExportResult.FAILURE diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py index 9e60d6cff9b..006d8038375 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import annotations +import abc import collections import logging import os @@ -56,7 +57,7 @@ class SpanExportResult(Enum): FAILURE = 1 -class SpanExporter: +class SpanExporter(abc.ABC): """Interface for exporting spans. Interface to be implemented by services that want to export spans recorded @@ -66,24 +67,30 @@ class SpanExporter: `SimpleSpanProcessor` or a `BatchSpanProcessor`. """ + @abc.abstractmethod def export( - self, spans: typing.Sequence[ReadableSpan] + self, + spans: typing.Sequence[ReadableSpan], + timeout_millis: typing.Optional[int] = None, ) -> "SpanExportResult": """Exports a batch of telemetry data. Args: spans: The list of `opentelemetry.trace.Span` objects to be exported + timeout_millis: Optional milliseconds until Export should timeout if it hasn't succeded. Returns: The result of the export """ + @abc.abstractmethod def shutdown(self) -> None: """Shuts down the exporter. Called when the SDK is shut down. 
""" + @abc.abstractmethod def force_flush(self, timeout_millis: int = 30000) -> bool: """Hint to ensure that the export of any spans the exporter has received prior to the call to ForceFlush SHOULD be completed as soon as possible, preferably From 8269f54296448de2d558548ca30039acdcd1e2e7 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Wed, 9 Apr 2025 18:35:31 +0000 Subject: [PATCH 08/21] Refactor BatchLogRecordProcessor --- .../sdk/_logs/_internal/export/__init__.py | 225 ++++++------------ opentelemetry-sdk/tests/logs/test_export.py | 225 ++++++++---------- 2 files changed, 173 insertions(+), 277 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py index 5e40aca3009..39452d4cbc1 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py @@ -22,8 +22,7 @@ import threading import weakref from os import environ, linesep -from time import time_ns -from typing import IO, Callable, Deque, List, Optional, Sequence +from typing import IO, Callable, Deque, Optional, Sequence from opentelemetry.context import ( _SUPPRESS_INSTRUMENTATION_KEY, @@ -56,6 +55,12 @@ class LogExportResult(enum.Enum): FAILURE = 1 +class BatchLogExportStrategy(enum.Enum): + EXPORT_ALL = 0 + EXPORT_WHILE_BATCH_EXCEEDS_THRESHOLD = 1 + EXPORT_AT_LEAST_ONE_BATCH = 2 + + class LogExporter(abc.ABC): """Interface for exporting logs. @@ -152,14 +157,6 @@ def force_flush(self, timeout_millis: int = 30000) -> bool: # pylint: disable=n return True -class _FlushRequest: - __slots__ = ["event", "num_log_records"] - - def __init__(self): - self.event = threading.Event() - self.num_log_records = 0 - - _BSP_RESET_ONCE = Once() @@ -178,8 +175,6 @@ class BatchLogRecordProcessor(LogRecordProcessor): """ _queue: Deque[LogData] - _flush_request: _FlushRequest | None - _log_records: List[LogData | None] def __init__( self, @@ -201,7 +196,7 @@ def __init__( max_export_batch_size = ( BatchLogRecordProcessor._default_max_export_batch_size() ) - + # Not used. No way currently to pass timeout to export. if export_timeout_millis is None: export_timeout_millis = ( BatchLogRecordProcessor._default_export_timeout_millis() @@ -210,30 +205,45 @@ def __init__( BatchLogRecordProcessor._validate_arguments( max_queue_size, schedule_delay_millis, max_export_batch_size ) - self._exporter = exporter self._max_queue_size = max_queue_size - self._schedule_delay_millis = schedule_delay_millis + self._schedule_delay = schedule_delay_millis / 1e3 self._max_export_batch_size = max_export_batch_size + # Not used. No way currently to pass timeout to export. self._export_timeout_millis = export_timeout_millis + # Deque is thread safe. 
self._queue = collections.deque([], max_queue_size) self._worker_thread = threading.Thread( name="OtelBatchLogRecordProcessor", target=self.worker, daemon=True, ) - self._condition = threading.Condition(threading.Lock()) self._shutdown = False - self._flush_request = None - self._log_records = [None] * self._max_export_batch_size + self._export_lock = threading.Lock() + self._worker_sleep = threading.Event() self._worker_thread.start() if hasattr(os, "register_at_fork"): weak_reinit = weakref.WeakMethod(self._at_fork_reinit) os.register_at_fork(after_in_child=lambda: weak_reinit()()) # pylint: disable=unnecessary-lambda self._pid = os.getpid() + def _should_export_batch( + self, batch_strategy: BatchLogExportStrategy, num_iterations: int + ) -> bool: + if not self._queue: + return False + # Always continue to export while queue length exceeds max batch size. + if len(self._queue) >= self._max_export_batch_size: + return True + if batch_strategy == BatchLogExportStrategy.EXPORT_ALL: + return True + if batch_strategy == BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: + return num_iterations == 0 + return False + def _at_fork_reinit(self): - self._condition = threading.Condition(threading.Lock()) + self._export_lock = threading.Lock() + self._worker_sleep = threading.Event() self._queue.clear() self._worker_thread = threading.Thread( name="OtelBatchLogRecordProcessor", @@ -244,152 +254,75 @@ def _at_fork_reinit(self): self._pid = os.getpid() def worker(self): - timeout = self._schedule_delay_millis / 1e3 - flush_request: Optional[_FlushRequest] = None while not self._shutdown: - with self._condition: - if self._shutdown: - # shutdown may have been called, avoid further processing - break - flush_request = self._get_and_unset_flush_request() - if ( - len(self._queue) < self._max_export_batch_size - and flush_request is None - ): - self._condition.wait(timeout) - - flush_request = self._get_and_unset_flush_request() - if not self._queue: - timeout = self._schedule_delay_millis / 1e3 - self._notify_flush_request_finished(flush_request) - flush_request = None - continue - if self._shutdown: - break - - start_ns = time_ns() - self._export(flush_request) - end_ns = time_ns() - # subtract the duration of this export call to the next timeout - timeout = self._schedule_delay_millis / 1e3 - ( - (end_ns - start_ns) / 1e9 - ) - - self._notify_flush_request_finished(flush_request) - flush_request = None - - # there might have been a new flush request while export was running - # and before the done flag switched to true - with self._condition: - shutdown_flush_request = self._get_and_unset_flush_request() - - # flush the remaining logs - self._drain_queue() - self._notify_flush_request_finished(flush_request) - self._notify_flush_request_finished(shutdown_flush_request) - - def _export(self, flush_request: Optional[_FlushRequest] = None): - """Exports logs considering the given flush_request. - - If flush_request is not None then logs are exported in batches - until the number of exported logs reached or exceeded the num of logs in - flush_request, otherwise exports at max max_export_batch_size logs. - """ - if flush_request is None: - self._export_batch() - return - - num_log_records = flush_request.num_log_records - while self._queue: - exported = self._export_batch() - num_log_records -= exported - - if num_log_records <= 0: + # Lots of strategies in the spec for setting next timeout. 
+            # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/sdk.md#batching-processor.
+            # Shutdown will interrupt this sleep. Emit will interrupt this sleep only if the queue is bigger than the threshold.
+            sleep_interrupted = self._worker_sleep.wait(self._schedule_delay)
+            if self._shutdown:
                 break
-
-    def _export_batch(self) -> int:
-        """Exports at most max_export_batch_size logs and returns the number of
-        exported logs.
-        """
-        idx = 0
-        while idx < self._max_export_batch_size and self._queue:
-            record = self._queue.pop()
-            self._log_records[idx] = record
-            idx += 1
-        token = attach(set_value(_SUPPRESS_INSTRUMENTATION_KEY, True))
-        try:
-            self._exporter.export(self._log_records[:idx])  # type: ignore
-        except Exception:  # pylint: disable=broad-exception-caught
-            _logger.exception("Exception while exporting logs.")
-        detach(token)
-
-        for index in range(idx):
-            self._log_records[index] = None
-        return idx
-
-    def _drain_queue(self):
-        """Export all elements until queue is empty.
-
-        Can only be called from the worker thread context because it invokes
-        `export` that is not thread safe.
-        """
-        while self._queue:
-            self._export_batch()
-
-    def _get_and_unset_flush_request(self) -> Optional[_FlushRequest]:
-        flush_request = self._flush_request
-        self._flush_request = None
-        if flush_request is not None:
-            flush_request.num_log_records = len(self._queue)
-        return flush_request
-
-    @staticmethod
-    def _notify_flush_request_finished(
-        flush_request: Optional[_FlushRequest] = None,
-    ):
-        if flush_request is not None:
-            flush_request.event.set()
-
-    def _get_or_create_flush_request(self) -> _FlushRequest:
-        if self._flush_request is None:
-            self._flush_request = _FlushRequest()
-        return self._flush_request
+            self._export(
+                BatchLogExportStrategy.EXPORT_WHILE_BATCH_EXCEEDS_THRESHOLD
+                if sleep_interrupted
+                else BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH
+            )
+            self._worker_sleep.clear()
+        self._export(BatchLogExportStrategy.EXPORT_ALL)
+
+    def _export(self, batch_strategy: BatchLogExportStrategy) -> None:
+        with self._export_lock:
+            iteration = 0
+            # We could see concurrent export calls from worker and force_flush. We call _should_export_batch
+            # once the lock is obtained to see if we still need to make the requested export.
+            while self._should_export_batch(batch_strategy, iteration):
+                iteration += 1
+                token = attach(set_value(_SUPPRESS_INSTRUMENTATION_KEY, True))
+                try:
+                    self._exporter.export(
+                        [
+                            # Oldest records are at the back, so pop from there.
+                            self._queue.pop()
+                            for _ in range(
+                                min(
+                                    self._max_export_batch_size,
+                                    len(self._queue),
+                                )
+                            )
+                        ]
+                    )
+                except Exception:  # pylint: disable=broad-exception-caught
+                    _logger.exception("Exception while exporting logs.")
+                detach(token)
 
     def emit(self, log_data: LogData) -> None:
-        """Adds the `LogData` to queue and notifies the waiting threads
-        when size of queue reaches max_export_batch_size.
-        """
         if self._shutdown:
+            _logger.warning("Shutdown called, ignoring log.")
             return
         if self._pid != os.getpid():
             _BSP_RESET_ONCE.do_once(self._at_fork_reinit)
 
+        if len(self._queue) == self._max_queue_size:
+            _logger.warning("Queue full, dropping log.")
         self._queue.appendleft(log_data)
         if len(self._queue) >= self._max_export_batch_size:
-            with self._condition:
-                self._condition.notify()
+            self._worker_sleep.set()
 
     def shutdown(self):
+        if self._shutdown:
+            return
+        # Prevents emit and force_flush from further calling export.
self._shutdown = True - with self._condition: - self._condition.notify_all() + # Interrupts sleep in the worker, if it's sleeping. + self._worker_sleep.set() + # Main worker loop should exit after one final export call with flush all strategy. self._worker_thread.join() self._exporter.shutdown() def force_flush(self, timeout_millis: Optional[int] = None) -> bool: - if timeout_millis is None: - timeout_millis = self._export_timeout_millis if self._shutdown: - return True - - with self._condition: - flush_request = self._get_or_create_flush_request() - self._condition.notify_all() - - ret = flush_request.event.wait(timeout_millis / 1e3) - if not ret: - _logger.warning("Timeout was exceeded in force_flush().") - return ret + return + # Blocking call to export. + self._export(BatchLogExportStrategy.EXPORT_ALL) @staticmethod def _default_max_queue_size(): diff --git a/opentelemetry-sdk/tests/logs/test_export.py b/opentelemetry-sdk/tests/logs/test_export.py index b9ec0ac2e7f..3d3c8de41e9 100644 --- a/opentelemetry-sdk/tests/logs/test_export.py +++ b/opentelemetry-sdk/tests/logs/test_export.py @@ -21,7 +21,7 @@ import unittest import weakref from concurrent.futures import ThreadPoolExecutor -from unittest.mock import Mock, patch +from unittest.mock import Mock, call, patch from opentelemetry._logs import SeverityNumber from opentelemetry.sdk import trace @@ -46,10 +46,14 @@ ) from opentelemetry.sdk.resources import Resource as SDKResource from opentelemetry.sdk.util.instrumentation import InstrumentationScope -from opentelemetry.test.concurrency_test import ConcurrencyTestBase from opentelemetry.trace import TraceFlags from opentelemetry.trace.span import INVALID_SPAN_CONTEXT +EMPTY_LOG = LogData( + log_record=LogRecord(), + instrumentation_scope=InstrumentationScope("example", "example"), +) + class TestSimpleLogRecordProcessor(unittest.TestCase): def test_simple_log_record_processor_default_level(self): @@ -328,7 +332,7 @@ def test_simple_log_record_processor_different_msg_types_with_formatter( self.assertEqual(expected, emitted) -class TestBatchLogRecordProcessor(ConcurrencyTestBase): +class TestBatchLogRecordProcessor(unittest.TestCase): def test_emit_call_log_record(self): exporter = InMemoryLogExporter() log_record_processor = Mock(wraps=BatchLogRecordProcessor(exporter)) @@ -353,7 +357,7 @@ def test_args(self): ) self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 1024) - self.assertEqual(log_record_processor._schedule_delay_millis, 2500) + self.assertEqual(log_record_processor._schedule_delay, 2.5) self.assertEqual(log_record_processor._max_export_batch_size, 256) self.assertEqual(log_record_processor._export_timeout_millis, 15000) @@ -371,7 +375,7 @@ def test_env_vars(self): log_record_processor = BatchLogRecordProcessor(exporter) self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 1024) - self.assertEqual(log_record_processor._schedule_delay_millis, 2500) + self.assertEqual(log_record_processor._schedule_delay, 2.5) self.assertEqual(log_record_processor._max_export_batch_size, 256) self.assertEqual(log_record_processor._export_timeout_millis, 15000) @@ -380,7 +384,7 @@ def test_args_defaults(self): log_record_processor = BatchLogRecordProcessor(exporter) self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 2048) - self.assertEqual(log_record_processor._schedule_delay_millis, 5000) + 
self.assertEqual(log_record_processor._schedule_delay, 5)
         self.assertEqual(log_record_processor._max_export_batch_size, 512)
         self.assertEqual(log_record_processor._export_timeout_millis, 30000)
 
@@ -400,7 +404,7 @@ def test_args_env_var_value_error(self):
             _logger.disabled = False
         self.assertEqual(log_record_processor._exporter, exporter)
         self.assertEqual(log_record_processor._max_queue_size, 2048)
-        self.assertEqual(log_record_processor._schedule_delay_millis, 5000)
+        self.assertEqual(log_record_processor._schedule_delay, 5)
         self.assertEqual(log_record_processor._max_export_batch_size, 512)
         self.assertEqual(log_record_processor._export_timeout_millis, 30000)
 
@@ -415,7 +419,7 @@ def test_args_none_defaults(self):
         )
         self.assertEqual(log_record_processor._exporter, exporter)
         self.assertEqual(log_record_processor._max_queue_size, 2048)
-        self.assertEqual(log_record_processor._schedule_delay_millis, 5000)
+        self.assertEqual(log_record_processor._schedule_delay, 5)
         self.assertEqual(log_record_processor._max_export_batch_size, 512)
         self.assertEqual(log_record_processor._export_timeout_millis, 30000)
 
@@ -465,161 +469,126 @@ def test_validation_negative_max_queue_size(self):
                 max_export_batch_size=101,
             )
 
-    def test_shutdown(self):
-        exporter = InMemoryLogExporter()
-        log_record_processor = BatchLogRecordProcessor(exporter)
-
-        provider = LoggerProvider()
-        provider.add_log_record_processor(log_record_processor)
-
-        logger = logging.getLogger("shutdown")
-        logger.addHandler(LoggingHandler(logger_provider=provider))
-
-        with self.assertLogs(level=logging.WARNING):
-            logger.warning("warning message: %s", "possible upcoming heatwave")
-        with self.assertLogs(level=logging.WARNING):
-            logger.error("Very high rise in temperatures across the globe")
-        with self.assertLogs(level=logging.WARNING):
-            logger.critical("Temperature hits high 420 C in Hyderabad")
+    def test_logs_exported_once_batch_size_reached(self):
+        exporter = Mock()
+        log_record_processor = BatchLogRecordProcessor(
+            exporter=exporter,
+            max_queue_size=15,
+            max_export_batch_size=15,
+            # Will not be reached during the test; this sleep should be interrupted when the batch size is reached.
+            schedule_delay_millis=30000,
+        )
+        before_export = time.time_ns()
+        for _ in range(15):
+            log_record_processor.emit(EMPTY_LOG)
+        # Wait a bit for the worker thread to wake up and call export.
+        time.sleep(0.1)
+        exporter.export.assert_called_once()
+        after_export = time.time_ns()
+        # Shows the worker's 30-second sleep was interrupted within a second.
+        self.assertTrue((after_export - before_export) < 1e9)
+
+    # pylint: disable=no-self-use
+    def test_logs_exported_once_schedule_delay_reached(self):
+        exporter = Mock()
+        log_record_processor = BatchLogRecordProcessor(
+            exporter=exporter,
+            # Should not be reached during the test; instead, export should be called when the delay millis is hit.
+            max_queue_size=15,
+            max_export_batch_size=15,
+            schedule_delay_millis=100,
+        )
+        for _ in range(15):
+            log_record_processor.emit(EMPTY_LOG)
+        time.sleep(0.11)
+        exporter.export.assert_has_calls(
+            [call([EMPTY_LOG]) for _ in range(15)]
+        )
+
+    def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self):
+        exporter = Mock()
+        log_record_processor = BatchLogRecordProcessor(
+            exporter=exporter,
+            # Neither of these thresholds should be hit before test ends.
+            max_queue_size=15,
+            max_export_batch_size=15,
+            schedule_delay_millis=30000,
+        )
+        # This log should be flushed because it was written before shutdown.
+ log_record_processor.emit(EMPTY_LOG) log_record_processor.shutdown() self.assertTrue(exporter._stopped) - finished_logs = exporter.get_finished_logs() - expected = [ - ("warning message: possible upcoming heatwave", "WARN"), - ("Very high rise in temperatures across the globe", "ERROR"), - ( - "Temperature hits high 420 C in Hyderabad", - "CRITICAL", - ), - ] - emitted = [ - (item.log_record.body, item.log_record.severity_text) - for item in finished_logs - ] - self.assertEqual(expected, emitted) - for item in finished_logs: - self.assertEqual(item.instrumentation_scope.name, "shutdown") - - def test_force_flush(self): - exporter = InMemoryLogExporter() - log_record_processor = BatchLogRecordProcessor(exporter) - - provider = LoggerProvider() - provider.add_log_record_processor(log_record_processor) - - logger = logging.getLogger("force_flush") - logger.propagate = False - logger.addHandler(LoggingHandler(logger_provider=provider)) - - logger.critical("Earth is burning") - log_record_processor.force_flush() - finished_logs = exporter.get_finished_logs() - self.assertEqual(len(finished_logs), 1) - log_record = finished_logs[0].log_record - self.assertEqual(log_record.body, "Earth is burning") - self.assertEqual(log_record.severity_number, SeverityNumber.FATAL) - self.assertEqual( - finished_logs[0].instrumentation_scope.name, "force_flush" + with self.assertLogs(level="WARNING") as log: + # This log should not be flushed. + log_record_processor.emit(EMPTY_LOG) + self.assertEqual(len(log.output), 1) + self.assertEqual(len(log.records), 1) + self.assertIn("Shutdown called, ignoring log.", log.output[0]) + exporter.export.assert_called_once_with([EMPTY_LOG]) + + # pylint: disable=no-self-use + def test_force_flush_flushes_logs(self): + exporter = Mock() + log_record_processor = BatchLogRecordProcessor( + exporter=exporter, + # Neither of these thresholds should be hit before test ends. 
+ max_queue_size=15, + max_export_batch_size=15, + schedule_delay_millis=30000, ) - - def test_log_record_processor_too_many_logs(self): - exporter = InMemoryLogExporter() - log_record_processor = BatchLogRecordProcessor(exporter) - - provider = LoggerProvider() - provider.add_log_record_processor(log_record_processor) - - logger = logging.getLogger("many_logs") - logger.propagate = False - logger.addHandler(LoggingHandler(logger_provider=provider)) - - for log_no in range(1000): - logger.critical("Log no: %s", log_no) - - self.assertTrue(log_record_processor.force_flush()) - finised_logs = exporter.get_finished_logs() - self.assertEqual(len(finised_logs), 1000) - for item in finised_logs: - self.assertEqual(item.instrumentation_scope.name, "many_logs") + for _ in range(10): + log_record_processor.emit(EMPTY_LOG) + log_record_processor.force_flush() + exporter.export.assert_called_once_with([EMPTY_LOG for _ in range(10)]) def test_with_multiple_threads(self): exporter = InMemoryLogExporter() log_record_processor = BatchLogRecordProcessor(exporter) - provider = LoggerProvider() - provider.add_log_record_processor(log_record_processor) - - logger = logging.getLogger("threads") - logger.propagate = False - logger.addHandler(LoggingHandler(logger_provider=provider)) - def bulk_log_and_flush(num_logs): for _ in range(num_logs): - logger.critical("Critical message") - self.assertTrue(log_record_processor.force_flush()) + log_record_processor.emit(EMPTY_LOG) + log_record_processor.force_flush() with ThreadPoolExecutor(max_workers=69) as executor: - futures = [] for idx in range(69): - future = executor.submit(bulk_log_and_flush, idx + 1) - futures.append(future) - + executor.submit(bulk_log_and_flush, idx + 1) executor.shutdown() finished_logs = exporter.get_finished_logs() self.assertEqual(len(finished_logs), 2415) - for item in finished_logs: - self.assertEqual(item.instrumentation_scope.name, "threads") @unittest.skipUnless( hasattr(os, "fork"), "needs *nix", ) def test_batch_log_record_processor_fork(self): - # pylint: disable=invalid-name exporter = InMemoryLogExporter() log_record_processor = BatchLogRecordProcessor( exporter, max_export_batch_size=64, - schedule_delay_millis=10, + schedule_delay_millis=30000, ) - provider = LoggerProvider() - provider.add_log_record_processor(log_record_processor) - - logger = logging.getLogger("test-fork") - logger.propagate = False - logger.addHandler(LoggingHandler(logger_provider=provider)) - - logger.critical("yolo") - time.sleep(0.5) # give some time for the exporter to upload - - self.assertTrue(log_record_processor.force_flush()) - self.assertEqual(len(exporter.get_finished_logs()), 1) - exporter.clear() - + # These are not expected to be flushed. Calling fork clears any logs not flushed. 
+        for _ in range(10):
+            log_record_processor.emit(EMPTY_LOG)
         multiprocessing.set_start_method("fork")
 
         def child(conn):
-            def _target():
-                logger.critical("Critical message child")
-
-            self.run_with_many_threads(_target, 100)
-
-            time.sleep(0.5)
-
+            for _ in range(100):
+                log_record_processor.emit(EMPTY_LOG)
+            log_record_processor.force_flush()
             logs = exporter.get_finished_logs()
             conn.send(len(logs) == 100)
             conn.close()
 
         parent_conn, child_conn = multiprocessing.Pipe()
-        p = multiprocessing.Process(target=child, args=(child_conn,))
-        p.start()
+        process = multiprocessing.Process(target=child, args=(child_conn,))
+        process.start()
         self.assertTrue(parent_conn.recv())
-        p.join()
-
-        log_record_processor.shutdown()
+        process.join()
+        self.assertTrue(len(exporter.get_finished_logs()) == 0)
 
     def test_batch_log_record_processor_gc(self):
         # Given a BatchLogRecordProcessor
@@ -680,11 +649,5 @@ def formatter(record):  # pylint: disable=unused-argument
         mock_stdout = Mock()
         exporter = ConsoleLogExporter(out=mock_stdout, formatter=formatter)
-        log_data = LogData(
-            log_record=LogRecord(),
-            instrumentation_scope=InstrumentationScope(
-                "first_name", "first_version"
-            ),
-        )
-        exporter.export([log_data])
+        exporter.export([EMPTY_LOG])
         mock_stdout.write.assert_called_once_with(mock_record_str)

From 072c6bc45a2df4f9d90229104c665274fe119cb9 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 11 Apr 2025 14:31:31 +0000
Subject: [PATCH 09/21] build(deps): bump jinja2 (#4534)

Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.5 to 3.1.6.
- [Release notes](https://github.com/pallets/jinja/releases)
- [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst)
- [Commits](https://github.com/pallets/jinja/compare/3.1.5...3.1.6)

---
updated-dependencies:
- dependency-name: jinja2
  dependency-version: 3.1.6
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Riccardo Magliocchetti
---
 .../examples/fork-process-model/flask-gunicorn/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/examples/fork-process-model/flask-gunicorn/requirements.txt b/docs/examples/fork-process-model/flask-gunicorn/requirements.txt
index 5146eabd116..e1dd8724a75 100644
--- a/docs/examples/fork-process-model/flask-gunicorn/requirements.txt
+++ b/docs/examples/fork-process-model/flask-gunicorn/requirements.txt
@@ -4,7 +4,7 @@ googleapis-common-protos==1.52.0
 grpcio==1.56.2
 gunicorn==22.0.0
 itsdangerous==2.1.2
-Jinja2==3.1.5
+Jinja2==3.1.6
 MarkupSafe==2.1.3
 opentelemetry-api==1.20.0
 opentelemetry-exporter-otlp==1.20.0

From ea17936cd50173c130cfa3c87815ef27722496ba Mon Sep 17 00:00:00 2001
From: Riccardo Magliocchetti
Date: Fri, 18 Apr 2025 09:20:49 +0200
Subject: [PATCH 10/21] logs: fix serialization of Extended attributes (#4342)

* logs: introduce LogAttributes type

Logs attributes accept AnyValue as AttributeValue; add a type to
describe that and start using it.
* LogAttributes -> ExtendedAttributes * Handle ExtendedAttributes in BoundedAttributes * opentelemetry-sdk: serialize extended attributes * Add changelog * Fix typing * Fix handling of not attribute values inside sequences * Please mypy * Please lint * More typing * Even more typing fixes * Fix docs * Fix mypy * Update LogRecord attributes typing to match reality * More typing * Move changelog to unreleased * ExtendedAttributes -> _ExtendedAttributes * opentelemetry-sdk: keep instrumentation scope attributes as Attributes * exporter/otlp: allow export of none values in logs attributes --- CHANGELOG.md | 3 + docs/conf.py | 4 + .../otlp/proto/common/_internal/__init__.py | 9 +- .../common/_internal/_log_encoder/__init__.py | 4 +- .../tests/test_log_encoder.py | 78 +++++++++- .../src/opentelemetry/_events/__init__.py | 27 ++-- .../opentelemetry/_logs/_internal/__init__.py | 22 +-- .../src/opentelemetry/attributes/__init__.py | 135 +++++++++++++++--- .../src/opentelemetry/util/types.py | 2 + .../tests/attributes/test_attributes.py | 107 +++++++++++++- .../tests/events/test_proxy_event.py | 4 +- opentelemetry-api/tests/logs/test_proxy.py | 4 +- .../src/opentelemetry/sdk/_events/__init__.py | 6 +- .../sdk/_logs/_internal/__init__.py | 20 +-- .../tests/logs/test_log_record.py | 15 +- 15 files changed, 370 insertions(+), 70 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38a77be0937..d9e89b18034 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +- Fix serialization of extended attributes for logs signal + ([#4342](https://github.com/open-telemetry/opentelemetry-python/pull/4342)) + ## Version 1.32.0/0.53b0 (2025-04-10) - Fix user agent in OTLP HTTP metrics exporter diff --git a/docs/conf.py b/docs/conf.py index 5e8037488bf..0a739269036 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -154,6 +154,10 @@ "py:class", "_contextvars.Token", ), + ( + "py:class", + "AnyValue", + ), ] # Add any paths that contain templates here, relative to this directory. 
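For context, a minimal end-to-end sketch of what this patch enables; it is not part of the diff, and the provider wiring and the logger name "extended-attrs-demo" are illustrative assumptions. With extended attributes, a standard-library log call can carry nested mappings and sequences, including None values, through the SDK and out the exporter (the nested value below mirrors the log8 case added to the encoder tests further down):

    import logging

    from opentelemetry._logs import set_logger_provider
    from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
    from opentelemetry.sdk._logs.export import (
        ConsoleLogExporter,
        SimpleLogRecordProcessor,
    )

    # Illustrative wiring: a provider with a console exporter so the cleaned
    # attributes are visible on stdout.
    provider = LoggerProvider()
    provider.add_log_record_processor(
        SimpleLogRecordProcessor(ConsoleLogExporter())
    )
    set_logger_provider(provider)

    demo = logging.getLogger("extended-attrs-demo")  # hypothetical name
    demo.addHandler(LoggingHandler(logger_provider=provider))

    # An _ExtendedAttributes value: a mapping whose nested sequence holds
    # another mapping with a None entry.
    demo.warning(
        "extended attributes demo",
        extra={"extended": {"sequence": [{"inner": "mapping", "none": None}]}},
    )
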
diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py index d1793a734ad..2f49502cf1d 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/__init__.py @@ -45,7 +45,7 @@ ) from opentelemetry.sdk.trace import Resource from opentelemetry.sdk.util.instrumentation import InstrumentationScope -from opentelemetry.util.types import Attributes +from opentelemetry.util.types import _ExtendedAttributes _logger = logging.getLogger(__name__) @@ -136,14 +136,17 @@ def _encode_trace_id(trace_id: int) -> bytes: def _encode_attributes( - attributes: Attributes, + attributes: _ExtendedAttributes, + allow_null: bool = False, ) -> Optional[List[PB2KeyValue]]: if attributes: pb2_attributes = [] for key, value in attributes.items(): # pylint: disable=broad-exception-caught try: - pb2_attributes.append(_encode_key_value(key, value)) + pb2_attributes.append( + _encode_key_value(key, value, allow_null=allow_null) + ) except Exception as error: _logger.exception("Failed to encode key %s: %s", key, error) else: diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/_log_encoder/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/_log_encoder/__init__.py index 9cd44844d06..9d713cb7ff0 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/_log_encoder/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/src/opentelemetry/exporter/otlp/proto/common/_internal/_log_encoder/__init__.py @@ -57,7 +57,9 @@ def _encode_log(log_data: LogData) -> PB2LogRecord: flags=int(log_data.log_record.trace_flags), body=_encode_value(body, allow_null=True), severity_text=log_data.log_record.severity_text, - attributes=_encode_attributes(log_data.log_record.attributes), + attributes=_encode_attributes( + log_data.log_record.attributes, allow_null=True + ), dropped_attributes_count=log_data.log_record.dropped_attributes, severity_number=log_data.log_record.severity_number.value, ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_log_encoder.py b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_log_encoder.py index 2c4e39eab10..4c2b54aad2b 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_log_encoder.py +++ b/exporter/opentelemetry-exporter-otlp-proto-common/tests/test_log_encoder.py @@ -225,7 +225,28 @@ def _get_sdk_log_data() -> List[LogData]: ), ) - return [log1, log2, log3, log4, log5, log6, log7] + log8 = LogData( + log_record=SDKLogRecord( + timestamp=1644650584292683044, + observed_timestamp=1644650584292683044, + trace_id=212592107417388365804938480559624925566, + span_id=6077757853989569466, + trace_flags=TraceFlags(0x01), + severity_text="INFO", + severity_number=SeverityNumber.INFO, + body="Test export of extended attributes", + resource=SDKResource({}), + attributes={ + "extended": { + "sequence": [{"inner": "mapping", "none": None}] + } + }, + ), + instrumentation_scope=InstrumentationScope( + "extended_name", "extended_version" + ), + ) + return [log1, log2, log3, log4, log5, log6, log7, 
log8] def get_test_logs( self, @@ -265,7 +286,8 @@ def get_test_logs( "Do not go gentle into that good night. Rage, rage against the dying of the light" ), attributes=_encode_attributes( - {"a": 1, "b": "c"} + {"a": 1, "b": "c"}, + allow_null=True, ), ) ], @@ -295,7 +317,8 @@ def get_test_logs( { "filename": "model.py", "func_name": "run_method", - } + }, + allow_null=True, ), ) ], @@ -326,7 +349,8 @@ def get_test_logs( { "filename": "model.py", "func_name": "run_method", - } + }, + allow_null=True, ), ) ], @@ -336,7 +360,8 @@ def get_test_logs( name="scope_with_attributes", version="scope_with_attributes_version", attributes=_encode_attributes( - {"one": 1, "two": "2"} + {"one": 1, "two": "2"}, + allow_null=True, ), ), schema_url="instrumentation_schema_url", @@ -360,7 +385,8 @@ def get_test_logs( { "filename": "model.py", "func_name": "run_method", - } + }, + allow_null=True, ), ) ], @@ -416,7 +442,8 @@ def get_test_logs( severity_number=SeverityNumber.DEBUG.value, body=_encode_value("To our galaxy"), attributes=_encode_attributes( - {"a": 1, "b": "c"} + {"a": 1, "b": "c"}, + allow_null=True, ), ), ], @@ -471,6 +498,43 @@ def get_test_logs( ), ], ), + PB2ScopeLogs( + scope=PB2InstrumentationScope( + name="extended_name", + version="extended_version", + ), + log_records=[ + PB2LogRecord( + time_unix_nano=1644650584292683044, + observed_time_unix_nano=1644650584292683044, + trace_id=_encode_trace_id( + 212592107417388365804938480559624925566 + ), + span_id=_encode_span_id( + 6077757853989569466, + ), + flags=int(TraceFlags(0x01)), + severity_text="INFO", + severity_number=SeverityNumber.INFO.value, + body=_encode_value( + "Test export of extended attributes" + ), + attributes=_encode_attributes( + { + "extended": { + "sequence": [ + { + "inner": "mapping", + "none": None, + } + ] + } + }, + allow_null=True, + ), + ), + ], + ), ], ), ] diff --git a/opentelemetry-api/src/opentelemetry/_events/__init__.py b/opentelemetry-api/src/opentelemetry/_events/__init__.py index e1e6a675a52..f073b223345 100644 --- a/opentelemetry-api/src/opentelemetry/_events/__init__.py +++ b/opentelemetry-api/src/opentelemetry/_events/__init__.py @@ -15,7 +15,7 @@ from abc import ABC, abstractmethod from logging import getLogger from os import environ -from typing import Any, Optional, cast +from typing import Optional, cast from opentelemetry._logs import LogRecord from opentelemetry._logs.severity import SeverityNumber @@ -25,7 +25,7 @@ from opentelemetry.trace.span import TraceFlags from opentelemetry.util._once import Once from opentelemetry.util._providers import _load_provider -from opentelemetry.util.types import Attributes +from opentelemetry.util.types import AnyValue, _ExtendedAttributes _logger = getLogger(__name__) @@ -38,18 +38,21 @@ def __init__( trace_id: Optional[int] = None, span_id: Optional[int] = None, trace_flags: Optional["TraceFlags"] = None, - body: Optional[Any] = None, + body: Optional[AnyValue] = None, severity_number: Optional[SeverityNumber] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ): attributes = attributes or {} - event_attributes = {**attributes, "event.name": name} + event_attributes = { + **attributes, + "event.name": name, + } super().__init__( timestamp=timestamp, trace_id=trace_id, span_id=span_id, trace_flags=trace_flags, - body=body, # type: ignore + body=body, severity_number=severity_number, attributes=event_attributes, ) @@ -62,7 +65,7 @@ def __init__( name: str, version: Optional[str] = None, schema_url: 
Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ): self._name = name self._version = version @@ -85,7 +88,7 @@ def __init__( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ): super().__init__( name=name, @@ -122,7 +125,7 @@ def get_event_logger( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ) -> EventLogger: """Returns an EventLoggerProvider for use.""" @@ -133,7 +136,7 @@ def get_event_logger( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ) -> EventLogger: return NoOpEventLogger( name, version=version, schema_url=schema_url, attributes=attributes @@ -146,7 +149,7 @@ def get_event_logger( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ) -> EventLogger: if _EVENT_LOGGER_PROVIDER: return _EVENT_LOGGER_PROVIDER.get_event_logger( @@ -208,7 +211,7 @@ def get_event_logger( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, event_logger_provider: Optional[EventLoggerProvider] = None, ) -> "EventLogger": if event_logger_provider is None: diff --git a/opentelemetry-api/src/opentelemetry/_logs/_internal/__init__.py b/opentelemetry-api/src/opentelemetry/_logs/_internal/__init__.py index f20bd8507e5..71fc97b0aaa 100644 --- a/opentelemetry-api/src/opentelemetry/_logs/_internal/__init__.py +++ b/opentelemetry-api/src/opentelemetry/_logs/_internal/__init__.py @@ -37,14 +37,14 @@ from logging import getLogger from os import environ from time import time_ns -from typing import Any, Optional, cast +from typing import Optional, cast from opentelemetry._logs.severity import SeverityNumber from opentelemetry.environment_variables import _OTEL_PYTHON_LOGGER_PROVIDER from opentelemetry.trace.span import TraceFlags from opentelemetry.util._once import Once from opentelemetry.util._providers import _load_provider -from opentelemetry.util.types import Attributes +from opentelemetry.util.types import AnyValue, _ExtendedAttributes _logger = getLogger(__name__) @@ -66,8 +66,8 @@ def __init__( trace_flags: Optional["TraceFlags"] = None, severity_text: Optional[str] = None, severity_number: Optional[SeverityNumber] = None, - body: Optional[Any] = None, - attributes: Optional["Attributes"] = None, + body: AnyValue = None, + attributes: Optional[_ExtendedAttributes] = None, ): self.timestamp = timestamp if observed_timestamp is None: @@ -78,7 +78,7 @@ def __init__( self.trace_flags = trace_flags self.severity_text = severity_text self.severity_number = severity_number - self.body = body # type: ignore + self.body = body self.attributes = attributes @@ -90,7 +90,7 @@ def __init__( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ) -> None: super().__init__() self._name = name @@ -119,7 +119,7 @@ def __init__( # pylint: disable=super-init-not-called name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - 
attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ): self._name = name self._version = version @@ -158,7 +158,7 @@ def get_logger( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ) -> Logger: """Returns a `Logger` for use by the given instrumentation library. @@ -196,7 +196,7 @@ def get_logger( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ) -> Logger: """Returns a NoOpLogger.""" return NoOpLogger( @@ -210,7 +210,7 @@ def get_logger( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ) -> Logger: if _LOGGER_PROVIDER: return _LOGGER_PROVIDER.get_logger( @@ -273,7 +273,7 @@ def get_logger( instrumenting_library_version: str = "", logger_provider: Optional[LoggerProvider] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ) -> "Logger": """Returns a `Logger` for use within a python process. diff --git a/opentelemetry-api/src/opentelemetry/attributes/__init__.py b/opentelemetry-api/src/opentelemetry/attributes/__init__.py index 71121f84697..fc3d494631a 100644 --- a/opentelemetry-api/src/opentelemetry/attributes/__init__.py +++ b/opentelemetry-api/src/opentelemetry/attributes/__init__.py @@ -118,6 +118,98 @@ def _clean_attribute( return None +def _clean_extended_attribute_value( + value: types.AnyValue, max_len: Optional[int] +) -> types.AnyValue: + # for primitive types just return the value and eventually shorten the string length + if value is None or isinstance(value, _VALID_ATTR_VALUE_TYPES): + if max_len is not None and isinstance(value, str): + value = value[:max_len] + return value + + if isinstance(value, Mapping): + cleaned_dict: dict[str, types.AnyValue] = {} + for key, element in value.items(): + # skip invalid keys + if not (key and isinstance(key, str)): + _logger.warning( + "invalid key `%s`. must be non-empty string.", key + ) + continue + + cleaned_dict[key] = _clean_extended_attribute( + key=key, value=element, max_len=max_len + ) + + return cleaned_dict + + if isinstance(value, Sequence): + sequence_first_valid_type = None + cleaned_seq: list[types.AnyValue] = [] + + for element in value: + if element is None: + cleaned_seq.append(element) + continue + + if max_len is not None and isinstance(element, str): + element = element[:max_len] + + element_type = type(element) + if element_type not in _VALID_ATTR_VALUE_TYPES: + element = _clean_extended_attribute_value( + element, max_len=max_len + ) + element_type = type(element) # type: ignore + + # The type of the sequence must be homogeneous. 
The first non-None + # element determines the type of the sequence + if sequence_first_valid_type is None: + sequence_first_valid_type = element_type + # use equality instead of isinstance as isinstance(True, int) evaluates to True + elif element_type != sequence_first_valid_type: + _logger.warning( + "Mixed types %s and %s in attribute value sequence", + sequence_first_valid_type.__name__, + type(element).__name__, + ) + return None + + cleaned_seq.append(element) + + # Freeze mutable sequences defensively + return tuple(cleaned_seq) + + raise TypeError( + f"Invalid type {type(value).__name__} for attribute value. " + f"Expected one of {[valid_type.__name__ for valid_type in _VALID_ANY_VALUE_TYPES]} or a " + "sequence of those types", + ) + + +def _clean_extended_attribute( + key: str, value: types.AnyValue, max_len: Optional[int] +) -> types.AnyValue: + """Checks if attribute value is valid and cleans it if required. + + The function returns the cleaned value or None if the value is not valid. + + An attribute value is valid if it is an AnyValue. + An attribute needs cleansing if: + - Its length is greater than the maximum allowed length. + """ + + if not (key and isinstance(key, str)): + _logger.warning("invalid key `%s`. must be non-empty string.", key) + return None + + try: + return _clean_extended_attribute_value(value, max_len=max_len) + except TypeError as exception: + _logger.warning("Attribute %s: %s", key, exception) + return None + + def _clean_attribute_value( value: types.AttributeValue, limit: Optional[int] ) -> Optional[types.AttributeValue]: @@ -146,9 +238,10 @@ class BoundedAttributes(MutableMapping): # type: ignore def __init__( self, maxlen: Optional[int] = None, - attributes: types.Attributes = None, + attributes: Optional[types._ExtendedAttributes] = None, immutable: bool = True, max_value_len: Optional[int] = None, + extended_attributes: bool = False, ): if maxlen is not None: if not isinstance(maxlen, int) or maxlen < 0: @@ -158,11 +251,12 @@ def __init__( self.maxlen = maxlen self.dropped = 0 self.max_value_len = max_value_len + self._extended_attributes = extended_attributes # OrderedDict is not used until the maxlen is reached for efficiency. 
self._dict: Union[ - MutableMapping[str, types.AttributeValue], - OrderedDict[str, types.AttributeValue], + MutableMapping[str, types.AnyValue], + OrderedDict[str, types.AnyValue], ] = {} self._lock = threading.RLock() if attributes: @@ -173,10 +267,10 @@ def __init__( def __repr__(self) -> str: return f"{dict(self._dict)}" - def __getitem__(self, key: str) -> types.AttributeValue: + def __getitem__(self, key: str) -> types.AnyValue: return self._dict[key] - def __setitem__(self, key: str, value: types.AttributeValue) -> None: + def __setitem__(self, key: str, value: types.AnyValue) -> None: if getattr(self, "_immutable", False): # type: ignore raise TypeError with self._lock: @@ -184,19 +278,24 @@ def __setitem__(self, key: str, value: types.AttributeValue) -> None: self.dropped += 1 return - value = _clean_attribute(key, value, self.max_value_len) # type: ignore - if value is not None: - if key in self._dict: - del self._dict[key] - elif ( - self.maxlen is not None and len(self._dict) == self.maxlen - ): - if not isinstance(self._dict, OrderedDict): - self._dict = OrderedDict(self._dict) - self._dict.popitem(last=False) # type: ignore - self.dropped += 1 - - self._dict[key] = value # type: ignore + if self._extended_attributes: + value = _clean_extended_attribute( + key, value, self.max_value_len + ) + else: + value = _clean_attribute(key, value, self.max_value_len) # type: ignore + if value is None: + return + + if key in self._dict: + del self._dict[key] + elif self.maxlen is not None and len(self._dict) == self.maxlen: + if not isinstance(self._dict, OrderedDict): + self._dict = OrderedDict(self._dict) + self._dict.popitem(last=False) # type: ignore + self.dropped += 1 + + self._dict[key] = value # type: ignore def __delitem__(self, key: str) -> None: if getattr(self, "_immutable", False): # type: ignore diff --git a/opentelemetry-api/src/opentelemetry/util/types.py b/opentelemetry-api/src/opentelemetry/util/types.py index be311faf555..7455c741c93 100644 --- a/opentelemetry-api/src/opentelemetry/util/types.py +++ b/opentelemetry-api/src/opentelemetry/util/types.py @@ -55,3 +55,5 @@ ], ..., ] + +_ExtendedAttributes = Mapping[str, "AnyValue"] diff --git a/opentelemetry-api/tests/attributes/test_attributes.py b/opentelemetry-api/tests/attributes/test_attributes.py index cf6aecb41fa..8a653387254 100644 --- a/opentelemetry-api/tests/attributes/test_attributes.py +++ b/opentelemetry-api/tests/attributes/test_attributes.py @@ -17,7 +17,11 @@ import unittest from typing import MutableSequence -from opentelemetry.attributes import BoundedAttributes, _clean_attribute +from opentelemetry.attributes import ( + BoundedAttributes, + _clean_attribute, + _clean_extended_attribute, +) class TestAttributes(unittest.TestCase): @@ -89,6 +93,96 @@ def test_sequence_attr_decode(self): ) +class TestExtendedAttributes(unittest.TestCase): + # pylint: disable=invalid-name + def assertValid(self, value, key="k"): + expected = value + if isinstance(value, MutableSequence): + expected = tuple(value) + self.assertEqual(_clean_extended_attribute(key, value, None), expected) + + def assertInvalid(self, value, key="k"): + self.assertIsNone(_clean_extended_attribute(key, value, None)) + + def test_attribute_key_validation(self): + # only non-empty strings are valid keys + self.assertInvalid(1, "") + self.assertInvalid(1, 1) + self.assertInvalid(1, {}) + self.assertInvalid(1, []) + self.assertInvalid(1, b"1") + self.assertValid(1, "k") + self.assertValid(1, "1") + + def test_clean_extended_attribute(self): + 
self.assertInvalid([1, 2, 3.4, "ss", 4]) + self.assertInvalid([{}, 1, 2, 3.4, 4]) + self.assertInvalid(["sw", "lf", 3.4, "ss"]) + self.assertInvalid([1, 2, 3.4, 5]) + self.assertInvalid([1, True]) + self.assertValid(None) + self.assertValid(True) + self.assertValid("hi") + self.assertValid(3.4) + self.assertValid(15) + self.assertValid([1, 2, 3, 5]) + self.assertValid([1.2, 2.3, 3.4, 4.5]) + self.assertValid([True, False]) + self.assertValid(["ss", "dw", "fw"]) + self.assertValid([]) + # None in sequences are valid + self.assertValid(["A", None, None]) + self.assertValid(["A", None, None, "B"]) + self.assertValid([None, None]) + self.assertInvalid(["A", None, 1]) + self.assertInvalid([None, "A", None, 1]) + # mappings + self.assertValid({}) + self.assertValid({"k": "v"}) + # mappings in sequences + self.assertValid([{"k": "v"}]) + + # test keys + self.assertValid("value", "key") + self.assertInvalid("value", "") + self.assertInvalid("value", None) + + def test_sequence_attr_decode(self): + seq = [ + None, + b"Content-Disposition", + b"Content-Type", + b"\x81", + b"Keep-Alive", + ] + self.assertEqual( + _clean_extended_attribute("headers", seq, None), tuple(seq) + ) + + def test_mapping(self): + mapping = { + "": "invalid", + b"bytes": "invalid", + "none": {"": "invalid"}, + "valid_primitive": "str", + "valid_sequence": ["str"], + "invalid_sequence": ["str", 1], + "valid_mapping": {"str": 1}, + "invalid_mapping": {"": 1}, + } + expected = { + "none": {}, + "valid_primitive": "str", + "valid_sequence": ("str",), + "invalid_sequence": None, + "valid_mapping": {"str": 1}, + "invalid_mapping": {}, + } + self.assertEqual( + _clean_extended_attribute("headers", mapping, None), expected + ) + + class TestBoundedAttributes(unittest.TestCase): # pylint: disable=consider-using-dict-items base = { @@ -196,3 +290,14 @@ def test_locking(self): for num in range(100): self.assertEqual(bdict[str(num)], num) + + # pylint: disable=no-self-use + def test_extended_attributes(self): + bdict = BoundedAttributes(extended_attributes=True, immutable=False) + with unittest.mock.patch( + "opentelemetry.attributes._clean_extended_attribute", + return_value="mock_value", + ) as clean_extended_attribute_mock: + bdict["key"] = "value" + + clean_extended_attribute_mock.assert_called_once() diff --git a/opentelemetry-api/tests/events/test_proxy_event.py b/opentelemetry-api/tests/events/test_proxy_event.py index 736dcf35d60..44121a97d46 100644 --- a/opentelemetry-api/tests/events/test_proxy_event.py +++ b/opentelemetry-api/tests/events/test_proxy_event.py @@ -4,7 +4,7 @@ import opentelemetry._events as events from opentelemetry.test.globals_test import EventsGlobalsTest -from opentelemetry.util.types import Attributes +from opentelemetry.util.types import _ExtendedAttributes class TestProvider(events.NoOpEventLoggerProvider): @@ -13,7 +13,7 @@ def get_event_logger( name: str, version: typing.Optional[str] = None, schema_url: typing.Optional[str] = None, - attributes: typing.Optional[Attributes] = None, + attributes: typing.Optional[_ExtendedAttributes] = None, ) -> events.EventLogger: return LoggerTest(name) diff --git a/opentelemetry-api/tests/logs/test_proxy.py b/opentelemetry-api/tests/logs/test_proxy.py index 8e87ceb96ea..64c024c3fa1 100644 --- a/opentelemetry-api/tests/logs/test_proxy.py +++ b/opentelemetry-api/tests/logs/test_proxy.py @@ -19,7 +19,7 @@ import opentelemetry._logs._internal as _logs_internal from opentelemetry import _logs from opentelemetry.test.globals_test import LoggingGlobalsTest -from 
opentelemetry.util.types import Attributes +from opentelemetry.util.types import _ExtendedAttributes class TestProvider(_logs.NoOpLoggerProvider): @@ -28,7 +28,7 @@ def get_logger( name: str, version: typing.Optional[str] = None, schema_url: typing.Optional[str] = None, - attributes: typing.Optional[Attributes] = None, + attributes: typing.Optional[_ExtendedAttributes] = None, ) -> _logs.Logger: return LoggerTest(name) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_events/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_events/__init__.py index ae16302546d..c427a48e2f8 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_events/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_events/__init__.py @@ -21,7 +21,7 @@ from opentelemetry._events import EventLoggerProvider as APIEventLoggerProvider from opentelemetry._logs import NoOpLogger, SeverityNumber, get_logger_provider from opentelemetry.sdk._logs import Logger, LoggerProvider, LogRecord -from opentelemetry.util.types import Attributes +from opentelemetry.util.types import _ExtendedAttributes _logger = logging.getLogger(__name__) @@ -33,7 +33,7 @@ def __init__( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ): super().__init__( name=name, @@ -74,7 +74,7 @@ def get_event_logger( name: str, version: Optional[str] = None, schema_url: Optional[str] = None, - attributes: Optional[Attributes] = None, + attributes: Optional[_ExtendedAttributes] = None, ) -> EventLogger: if not name: _logger.warning("EventLogger created with invalid name: %s", name) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py index 5d17c39f332..58872f68020 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py @@ -24,7 +24,7 @@ from os import environ from threading import Lock from time import time_ns -from typing import Any, Callable, Tuple, Union # noqa +from typing import Any, Callable, Tuple, Union, cast # noqa from opentelemetry._logs import Logger as APILogger from opentelemetry._logs import LoggerProvider as APILoggerProvider @@ -52,7 +52,7 @@ get_current_span, ) from opentelemetry.trace.span import TraceFlags -from opentelemetry.util.types import AnyValue, Attributes +from opentelemetry.util.types import AnyValue, _ExtendedAttributes _logger = logging.getLogger(__name__) @@ -182,7 +182,7 @@ def __init__( severity_number: SeverityNumber | None = None, body: AnyValue | None = None, resource: Resource | None = None, - attributes: Attributes | None = None, + attributes: _ExtendedAttributes | None = None, limits: LogLimits | None = _UnsetLogLimits, ): super().__init__( @@ -200,6 +200,7 @@ def __init__( attributes=attributes if bool(attributes) else None, immutable=False, max_value_len=limits.max_attribute_length, + extended_attributes=True, ), } ) @@ -250,8 +251,11 @@ def to_json(self, indent: int | None = 4) -> str: @property def dropped_attributes(self) -> int: - if self.attributes: - return self.attributes.dropped + attributes: BoundedAttributes = cast( + BoundedAttributes, self.attributes + ) + if attributes: + return attributes.dropped return 0 @@ -477,7 +481,7 @@ def __init__( self._logger_provider = logger_provider or get_logger_provider() @staticmethod - def _get_attributes(record: logging.LogRecord) -> Attributes: + def 
_get_attributes(record: logging.LogRecord) -> _ExtendedAttributes: attributes = { k: v for k, v in vars(record).items() if k not in _RESERVED_ATTRS } @@ -636,7 +640,7 @@ def _get_logger_no_cache( name: str, version: str | None = None, schema_url: str | None = None, - attributes: Attributes | None = None, + attributes: _ExtendedAttributes | None = None, ) -> Logger: return Logger( self._resource, @@ -670,7 +674,7 @@ def get_logger( name: str, version: str | None = None, schema_url: str | None = None, - attributes: Attributes | None = None, + attributes: _ExtendedAttributes | None = None, ) -> Logger: if self._disabled: return NoOpLogger( diff --git a/opentelemetry-sdk/tests/logs/test_log_record.py b/opentelemetry-sdk/tests/logs/test_log_record.py index f42d3a26ea4..4a0d58dc9b1 100644 --- a/opentelemetry-sdk/tests/logs/test_log_record.py +++ b/opentelemetry-sdk/tests/logs/test_log_record.py @@ -33,7 +33,12 @@ def test_log_record_to_json(self): "body": "a log line", "severity_number": None, "severity_text": None, - "attributes": None, + "attributes": { + "mapping": {"key": "value"}, + "none": None, + "sequence": [1, 2], + "str": "string", + }, "dropped_attributes": 0, "timestamp": "1970-01-01T00:00:00.000000Z", "observed_timestamp": "1970-01-01T00:00:00.000000Z", @@ -52,12 +57,18 @@ def test_log_record_to_json(self): observed_timestamp=0, body="a log line", resource=Resource({"service.name": "foo"}), + attributes={ + "mapping": {"key": "value"}, + "none": None, + "sequence": [1, 2], + "str": "string", + }, ) self.assertEqual(expected, actual.to_json(indent=4)) self.assertEqual( actual.to_json(indent=None), - '{"body": "a log line", "severity_number": null, "severity_text": null, "attributes": null, "dropped_attributes": 0, "timestamp": "1970-01-01T00:00:00.000000Z", "observed_timestamp": "1970-01-01T00:00:00.000000Z", "trace_id": "", "span_id": "", "trace_flags": null, "resource": {"attributes": {"service.name": "foo"}, "schema_url": ""}}', + '{"body": "a log line", "severity_number": null, "severity_text": null, "attributes": {"mapping": {"key": "value"}, "none": null, "sequence": [1, 2], "str": "string"}, "dropped_attributes": 0, "timestamp": "1970-01-01T00:00:00.000000Z", "observed_timestamp": "1970-01-01T00:00:00.000000Z", "trace_id": "", "span_id": "", "trace_flags": null, "resource": {"attributes": {"service.name": "foo"}, "schema_url": ""}}', ) def test_log_record_to_json_serializes_severity_number_as_int(self): From dae12881d910976733fd0a95d7a660db3c379620 Mon Sep 17 00:00:00 2001 From: Jay Clifford <45856600+Jayclifford345@users.noreply.github.com> Date: Wed, 23 Apr 2025 15:16:47 +0100 Subject: [PATCH 11/21] feat: Updated and added examples (logs and metrics) (#4559) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * added examples * Apply suggestions from code review Co-authored-by: Emídio Neto <9735060+emdneto@users.noreply.github.com> * feat: added examples for metrics and logs * fixed spelling * Update docs/examples/metrics/reader/README.rst Co-authored-by: Emídio Neto <9735060+emdneto@users.noreply.github.com> --------- Co-authored-by: Emídio Neto <9735060+emdneto@users.noreply.github.com> --- CHANGELOG.md | 2 + docs/examples/logs/README.rst | 101 ++++++++++++------ docs/examples/logs/example.py | 4 + docs/examples/logs/otel-collector-config.yaml | 2 +- docs/examples/metrics/reader/README.rst | 1 + .../metrics/reader/synchronous_gauge_read.py | 88 +++++++++++++++ 6 files changed, 163 insertions(+), 35 deletions(-) create mode 100644 
docs/examples/metrics/reader/synchronous_gauge_read.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d9e89b18034..256312c8c44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix serialization of extended attributes for logs signal ([#4342](https://github.com/open-telemetry/opentelemetry-python/pull/4342)) +- docs: updated and added to the metrics and log examples + ([#4559](https://github.com/open-telemetry/opentelemetry-python/pull/4559)) ## Version 1.32.0/0.53b0 (2025-04-10) diff --git a/docs/examples/logs/README.rst b/docs/examples/logs/README.rst index b61114733ec..e3cd86362b2 100644 --- a/docs/examples/logs/README.rst +++ b/docs/examples/logs/README.rst @@ -52,37 +52,70 @@ The resulting logs will appear in the output from the collector and look similar .. code-block:: sh - Resource SchemaURL: - Resource labels: - -> telemetry.sdk.language: STRING(python) - -> telemetry.sdk.name: STRING(opentelemetry) - -> telemetry.sdk.version: STRING(1.8.0) - -> service.name: STRING(shoppingcart) - -> service.instance.id: STRING(instance-12) - InstrumentationLibraryLogs #0 - InstrumentationLibraryMetrics SchemaURL: - InstrumentationLibrary __main__ 0.1 - LogRecord #0 - Timestamp: 2022-01-13 20:37:03.998733056 +0000 UTC - Severity: WARNING - ShortName: - Body: Jail zesty vixen who grabbed pay from quack. - Trace ID: - Span ID: - Flags: 0 - LogRecord #1 - Timestamp: 2022-01-13 20:37:04.082757888 +0000 UTC - Severity: ERROR - ShortName: - Body: The five boxing wizards jump quickly. - Trace ID: - Span ID: - Flags: 0 - LogRecord #2 - Timestamp: 2022-01-13 20:37:04.082979072 +0000 UTC - Severity: ERROR - ShortName: - Body: Hyderabad, we have a major problem. - Trace ID: 63491217958f126f727622e41d4460f3 - Span ID: d90c57d6e1ca4f6c - Flags: 1 + ResourceLog #0 + Resource SchemaURL: + Resource attributes: + -> telemetry.sdk.language: Str(python) + -> telemetry.sdk.name: Str(opentelemetry) + -> telemetry.sdk.version: Str(1.33.0.dev0) + -> service.name: Str(shoppingcart) + -> service.instance.id: Str(instance-12) + ScopeLogs #0 + ScopeLogs SchemaURL: + InstrumentationScope myapp.area2 + LogRecord #0 + ObservedTimestamp: 2025-04-22 12:16:57.315179 +0000 UTC + Timestamp: 2025-04-22 12:16:57.315152896 +0000 UTC + SeverityText: WARN + SeverityNumber: Warn(13) + Body: Str(Jail zesty vixen who grabbed pay from quack.) + Attributes: + -> code.filepath: Str(/Users/jayclifford/Repos/opentelemetry-python/docs/examples/logs/example.py) + -> code.function: Str() + -> code.lineno: Int(47) + Trace ID: + Span ID: + Flags: 0 + LogRecord #1 + ObservedTimestamp: 2025-04-22 12:16:57.31522 +0000 UTC + Timestamp: 2025-04-22 12:16:57.315213056 +0000 UTC + SeverityText: ERROR + SeverityNumber: Error(17) + Body: Str(The five boxing wizards jump quickly.) + Attributes: + -> code.filepath: Str(/Users/jayclifford/Repos/opentelemetry-python/docs/examples/logs/example.py) + -> code.function: Str() + -> code.lineno: Int(48) + Trace ID: + Span ID: + Flags: 0 + LogRecord #2 + ObservedTimestamp: 2025-04-22 12:16:57.315445 +0000 UTC + Timestamp: 2025-04-22 12:16:57.31543808 +0000 UTC + SeverityText: ERROR + SeverityNumber: Error(17) + Body: Str(Hyderabad, we have a major problem.) 
+ Attributes: + -> code.filepath: Str(/Users/jayclifford/Repos/opentelemetry-python/docs/examples/logs/example.py) + -> code.function: Str() + -> code.lineno: Int(61) + Trace ID: 8a6739fffce895e694700944e2faf23e + Span ID: a45337020100cb63 + Flags: 1 + ScopeLogs #1 + ScopeLogs SchemaURL: + InstrumentationScope myapp.area1 + LogRecord #0 + ObservedTimestamp: 2025-04-22 12:16:57.315242 +0000 UTC + Timestamp: 2025-04-22 12:16:57.315234048 +0000 UTC + SeverityText: ERROR + SeverityNumber: Error(17) + Body: Str(I have custom attributes.) + Attributes: + -> user_id: Str(user-123) + -> code.filepath: Str(/Users/jayclifford/Repos/opentelemetry-python/docs/examples/logs/example.py) + -> code.function: Str() + -> code.lineno: Int(53) + Trace ID: + Span ID: + Flags: 0 diff --git a/docs/examples/logs/example.py b/docs/examples/logs/example.py index ba471ea7e69..c782d457533 100644 --- a/docs/examples/logs/example.py +++ b/docs/examples/logs/example.py @@ -47,6 +47,10 @@ logger2.warning("Jail zesty vixen who grabbed pay from quack.") logger2.error("The five boxing wizards jump quickly.") +# Log custom attributes +# Custom attributes are added on a per event basis +user_id = "user-123" +logger1.error("I have custom attributes.", extra={"user_id": user_id}) # Trace context correlation tracer = trace.get_tracer(__name__) diff --git a/docs/examples/logs/otel-collector-config.yaml b/docs/examples/logs/otel-collector-config.yaml index 50d29086415..64495c75091 100644 --- a/docs/examples/logs/otel-collector-config.yaml +++ b/docs/examples/logs/otel-collector-config.yaml @@ -6,7 +6,7 @@ receivers: exporters: debug: - verbosity: debug + verbosity: detailed processors: batch: diff --git a/docs/examples/metrics/reader/README.rst b/docs/examples/metrics/reader/README.rst index 4822fe77669..01a913f22a3 100644 --- a/docs/examples/metrics/reader/README.rst +++ b/docs/examples/metrics/reader/README.rst @@ -6,6 +6,7 @@ These examples show how to customize the metrics that are output by the SDK usin * preferred_aggregation.py: Shows how to configure the preferred aggregation for metric instrument types. * preferred_temporality.py: Shows how to configure the preferred temporality for metric instrument types. * preferred_exemplarfilter.py: Shows how to configure the exemplar filter. +* synchronous_gauge_read.py: Shows how to use `PeriodicExportingMetricReader` in a synchronous manner to explicitly control the collection of metrics. The source files of these examples are available :scm_web:`here `. diff --git a/docs/examples/metrics/reader/synchronous_gauge_read.py b/docs/examples/metrics/reader/synchronous_gauge_read.py new file mode 100644 index 00000000000..d45f7ff00da --- /dev/null +++ b/docs/examples/metrics/reader/synchronous_gauge_read.py @@ -0,0 +1,88 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import math
+from typing import Iterable
+
+from opentelemetry.metrics import (
+    CallbackOptions,
+    Observation,
+    get_meter_provider,
+    set_meter_provider,
+)
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import (
+    ConsoleMetricExporter,
+    PeriodicExportingMetricReader,
+)
+
+temperature = 0.0
+humidity = 0.0
+
+
+# Function called by the gauge to read the temperature
+def read_temperature(options: CallbackOptions) -> Iterable[Observation]:
+    global temperature
+    yield Observation(value=temperature, attributes={"room": "living-room"})
+
+
+# Function called by the gauge to read the humidity
+def read_humidity(options: CallbackOptions) -> Iterable[Observation]:
+    global humidity
+    yield Observation(value=humidity, attributes={"room": "living-room"})
+
+
+# Use console exporter for the example
+exporter = ConsoleMetricExporter()
+
+# If the export interval is set to math.inf, the PeriodicExportingMetricReader
+# will not invoke periodic collection
+reader = PeriodicExportingMetricReader(
+    exporter,
+    export_interval_millis=math.inf,
+)
+
+provider = MeterProvider(metric_readers=[reader])
+set_meter_provider(provider)
+
+meter = get_meter_provider().get_meter("synchronous_read", "0.1.2")
+
+gauge = meter.create_observable_gauge(
+    name="synchronous_gauge_temperature",
+    description="Gauge value captured synchronously",
+    callbacks=[read_temperature],
+)
+
+# Simulate synchronous reading of temperature
+print("--- Simulating synchronous reading of temperature ---", flush=True)
+temperature = 25.0
+reader.collect()
+# Note: the reader will only collect the last value set before `collect` is called
+print("--- Last value only ---", flush=True)
+temperature = 30.0
+temperature = 35.0
+reader.collect()
+# Invoking `collect` will read all measurements assigned to the reader
+gauge2 = meter.create_observable_gauge(
+    name="synchronous_gauge_humidity",
+    description="Gauge value captured synchronously",
+    callbacks=[read_humidity],
+)
+print("--- Multiple Measurements ---", flush=True)
+temperature = 20.0
+humidity = 50.0
+reader.collect()
+# Invoking `force_flush` will read all measurements assigned to the reader
+print("--- Invoking force_flush ---", flush=True)
+provider.force_flush()

From 4def4acf6f0a5a328b2dc53d7d2bbfdd1afef836 Mon Sep 17 00:00:00 2001
From: Riccardo Magliocchetti
Date: Wed, 23 Apr 2025 16:36:16 +0200
Subject: [PATCH 12/21] opentelemetry-sdk: use stable code attributes (#4508)

---
 CHANGELOG.md                                  |  2 +
 .../sdk/_logs/_internal/__init__.py           | 21 ++++++-----
 opentelemetry-sdk/tests/logs/test_handler.py  | 37 +++++++++++--------
 3 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 256312c8c44..cc7a26bb789 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## Unreleased

+- opentelemetry-sdk: use stable code attributes: `code.function` -> `code.function.name`, `code.lineno` -> `code.line.number`, `code.filepath` -> `code.file.path`
+  ([#4508](https://github.com/open-telemetry/opentelemetry-python/pull/4508))
 - Fix serialization of extended attributes for logs signal
   ([#4342](https://github.com/open-telemetry/opentelemetry-python/pull/4342))
 - docs: updated and added to the metrics and log examples
diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py
index 58872f68020..9060e49aac4 100644
---
a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/__init__.py @@ -45,7 +45,8 @@ from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.util import ns_to_iso_str from opentelemetry.sdk.util.instrumentation import InstrumentationScope -from opentelemetry.semconv.trace import SpanAttributes +from opentelemetry.semconv._incubating.attributes import code_attributes +from opentelemetry.semconv.attributes import exception_attributes from opentelemetry.trace import ( format_span_id, format_trace_id, @@ -487,22 +488,24 @@ def _get_attributes(record: logging.LogRecord) -> _ExtendedAttributes: } # Add standard code attributes for logs. - attributes[SpanAttributes.CODE_FILEPATH] = record.pathname - attributes[SpanAttributes.CODE_FUNCTION] = record.funcName - attributes[SpanAttributes.CODE_LINENO] = record.lineno + attributes[code_attributes.CODE_FILE_PATH] = record.pathname + attributes[code_attributes.CODE_FUNCTION_NAME] = record.funcName + attributes[code_attributes.CODE_LINE_NUMBER] = record.lineno if record.exc_info: exctype, value, tb = record.exc_info if exctype is not None: - attributes[SpanAttributes.EXCEPTION_TYPE] = exctype.__name__ + attributes[exception_attributes.EXCEPTION_TYPE] = ( + exctype.__name__ + ) if value is not None and value.args: - attributes[SpanAttributes.EXCEPTION_MESSAGE] = str( + attributes[exception_attributes.EXCEPTION_MESSAGE] = str( value.args[0] ) if tb is not None: - # https://github.com/open-telemetry/opentelemetry-specification/blob/9fa7c656b26647b27e485a6af7e38dc716eba98a/specification/trace/semantic_conventions/exceptions.md#stacktrace-representation - attributes[SpanAttributes.EXCEPTION_STACKTRACE] = "".join( - traceback.format_exception(*record.exc_info) + # https://opentelemetry.io/docs/specs/semconv/exceptions/exceptions-spans/#stacktrace-representation + attributes[exception_attributes.EXCEPTION_STACKTRACE] = ( + "".join(traceback.format_exception(*record.exc_info)) ) return attributes diff --git a/opentelemetry-sdk/tests/logs/test_handler.py b/opentelemetry-sdk/tests/logs/test_handler.py index 1b62cc6c788..3817c440258 100644 --- a/opentelemetry-sdk/tests/logs/test_handler.py +++ b/opentelemetry-sdk/tests/logs/test_handler.py @@ -27,7 +27,8 @@ LoggingHandler, LogRecordProcessor, ) -from opentelemetry.semconv.trace import SpanAttributes +from opentelemetry.semconv._incubating.attributes import code_attributes +from opentelemetry.semconv.attributes import exception_attributes from opentelemetry.trace import INVALID_SPAN_CONTEXT @@ -127,17 +128,19 @@ def test_log_record_user_attributes(self): self.assertEqual(len(log_record.attributes), 4) self.assertEqual(log_record.attributes["http.status_code"], 200) self.assertTrue( - log_record.attributes[SpanAttributes.CODE_FILEPATH].endswith( + log_record.attributes[code_attributes.CODE_FILE_PATH].endswith( "test_handler.py" ) ) self.assertEqual( - log_record.attributes[SpanAttributes.CODE_FUNCTION], + log_record.attributes[code_attributes.CODE_FUNCTION_NAME], "test_log_record_user_attributes", ) # The line of the log statement is not a constant (changing tests may change that), # so only check that the attribute is present. 
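         # (Illustration, not part of this patch: the stable constants are
         # plain strings, so these assertions check attribute keys such as
         # code_attributes.CODE_FILE_PATH == "code.file.path" and
         # code_attributes.CODE_LINE_NUMBER == "code.line.number",
         # per the renames listed in the changelog entry above.)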
- self.assertTrue(SpanAttributes.CODE_LINENO in log_record.attributes) + self.assertTrue( + code_attributes.CODE_LINE_NUMBER in log_record.attributes + ) self.assertTrue(isinstance(log_record.attributes, BoundedAttributes)) def test_log_record_exception(self): @@ -156,15 +159,15 @@ def test_log_record_exception(self): self.assertTrue(isinstance(log_record.body, str)) self.assertEqual(log_record.body, "Zero Division Error") self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_TYPE], + log_record.attributes[exception_attributes.EXCEPTION_TYPE], ZeroDivisionError.__name__, ) self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_MESSAGE], + log_record.attributes[exception_attributes.EXCEPTION_MESSAGE], "division by zero", ) stack_trace = log_record.attributes[ - SpanAttributes.EXCEPTION_STACKTRACE + exception_attributes.EXCEPTION_STACKTRACE ] self.assertIsInstance(stack_trace, str) self.assertTrue("Traceback" in stack_trace) @@ -189,15 +192,15 @@ def test_log_record_recursive_exception(self): self.assertIsNotNone(log_record) self.assertEqual(log_record.body, "Zero Division Error") self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_TYPE], + log_record.attributes[exception_attributes.EXCEPTION_TYPE], ZeroDivisionError.__name__, ) self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_MESSAGE], + log_record.attributes[exception_attributes.EXCEPTION_MESSAGE], "division by zero", ) stack_trace = log_record.attributes[ - SpanAttributes.EXCEPTION_STACKTRACE + exception_attributes.EXCEPTION_STACKTRACE ] self.assertIsInstance(stack_trace, str) self.assertTrue("Traceback" in stack_trace) @@ -219,12 +222,14 @@ def test_log_exc_info_false(self): self.assertIsNotNone(log_record) self.assertEqual(log_record.body, "Zero Division Error") - self.assertNotIn(SpanAttributes.EXCEPTION_TYPE, log_record.attributes) self.assertNotIn( - SpanAttributes.EXCEPTION_MESSAGE, log_record.attributes + exception_attributes.EXCEPTION_TYPE, log_record.attributes + ) + self.assertNotIn( + exception_attributes.EXCEPTION_MESSAGE, log_record.attributes ) self.assertNotIn( - SpanAttributes.EXCEPTION_STACKTRACE, log_record.attributes + exception_attributes.EXCEPTION_STACKTRACE, log_record.attributes ) def test_log_record_exception_with_object_payload(self): @@ -246,15 +251,15 @@ def __str__(self): self.assertTrue(isinstance(log_record.body, str)) self.assertEqual(log_record.body, "CustomException stringified") self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_TYPE], + log_record.attributes[exception_attributes.EXCEPTION_TYPE], CustomException.__name__, ) self.assertEqual( - log_record.attributes[SpanAttributes.EXCEPTION_MESSAGE], + log_record.attributes[exception_attributes.EXCEPTION_MESSAGE], "CustomException message", ) stack_trace = log_record.attributes[ - SpanAttributes.EXCEPTION_STACKTRACE + exception_attributes.EXCEPTION_STACKTRACE ] self.assertIsInstance(stack_trace, str) self.assertTrue("Traceback" in stack_trace) From 211c49ebb19914e59a16721d2becab9c617f8ad9 Mon Sep 17 00:00:00 2001 From: Alex Boten <223565+codeboten@users.noreply.github.com> Date: Wed, 23 Apr 2025 13:16:09 -0700 Subject: [PATCH 13/21] bugfix(exporter): ensure response is closed (#4477) --- CHANGELOG.md | 2 ++ .../otlp/proto/http/_log_exporter/__init__.py | 29 ++++++++++++++----- .../proto/http/metric_exporter/__init__.py | 29 ++++++++++++++----- .../proto/http/trace_exporter/__init__.py | 29 ++++++++++++++----- 4 files changed, 68 insertions(+), 21 deletions(-) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index cc7a26bb789..f379f34e0de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +- Fix intermittent `Connection aborted` error when using otlp/http exporters + ([#4477](https://github.com/open-telemetry/opentelemetry-python/pull/4477)) - opentelemetry-sdk: use stable code attributes: `code.function` -> `code.function.name`, `code.lineno` -> `code.line.number`, `code.filepath` -> `code.file.path` ([#4508](https://github.com/open-telemetry/opentelemetry-python/pull/4508)) - Fix serialization of extended attributes for logs signal diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index 4662c8e4d55..1a46cab057c 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -21,6 +21,7 @@ from typing import Dict, Optional, Sequence import requests +from requests.exceptions import ConnectionError from opentelemetry.exporter.otlp.proto.common._log_encoder import encode_logs from opentelemetry.exporter.otlp.proto.http import ( @@ -128,13 +129,27 @@ def _export(self, serialized_data: bytes, timeout_sec: float): elif self._compression == Compression.Deflate: data = zlib.compress(serialized_data) - return self._session.post( - url=self._endpoint, - data=data, - verify=self._certificate_file, - timeout=timeout_sec, - cert=self._client_cert, - ) + # By default, keep-alive is enabled in Session's request + # headers. Backends may choose to close the connection + # while a post happens which causes an unhandled + # exception. 
This try/except will retry the post on such exceptions
+        try:
+            resp = self._session.post(
+                url=self._endpoint,
+                data=data,
+                verify=self._certificate_file,
+                timeout=timeout_sec,
+                cert=self._client_cert,
+            )
+        except ConnectionError:
+            resp = self._session.post(
+                url=self._endpoint,
+                data=data,
+                verify=self._certificate_file,
+                timeout=timeout_sec,
+                cert=self._client_cert,
+            )
+        return resp

     @staticmethod
     def _retryable(resp: requests.Response) -> bool:
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py
index 46e7d3b84b6..6c8b930fbc7 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py
@@ -30,6 +30,7 @@

 import requests
 from deprecated import deprecated
+from requests.exceptions import ConnectionError

 from opentelemetry.exporter.otlp.proto.common._internal import (
     _get_resource_data,
@@ -173,13 +174,27 @@ def _export(self, serialized_data: bytes, timeout_sec: float):
         elif self._compression == Compression.Deflate:
             data = zlib.compress(serialized_data)

-        return self._session.post(
-            url=self._endpoint,
-            data=data,
-            verify=self._certificate_file,
-            timeout=timeout_sec,
-            cert=self._client_cert,
-        )
+        # By default, keep-alive is enabled in Session's request
+        # headers. Backends may choose to close the connection
+        # while a post happens which causes an unhandled
+        # exception. This try/except will retry the post on such exceptions
+        try:
+            resp = self._session.post(
+                url=self._endpoint,
+                data=data,
+                verify=self._certificate_file,
+                timeout=timeout_sec,
+                cert=self._client_cert,
+            )
+        except ConnectionError:
+            resp = self._session.post(
+                url=self._endpoint,
+                data=data,
+                verify=self._certificate_file,
+                timeout=timeout_sec,
+                cert=self._client_cert,
+            )
+        return resp

     @staticmethod
     def _retryable(resp: requests.Response) -> bool:
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py
index 0c913df0e88..6fa0f1c1bdd 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py
@@ -21,6 +21,7 @@

 from typing import Dict, Optional, Sequence

 import requests
+from requests.exceptions import ConnectionError

 from opentelemetry.exporter.otlp.proto.common.trace_encoder import (
     encode_spans,
@@ -126,13 +127,27 @@ def _export(self, serialized_data: bytes, timeout_sec: float):
         elif self._compression == Compression.Deflate:
             data = zlib.compress(serialized_data)

-        return self._session.post(
-            url=self._endpoint,
-            data=data,
-            verify=self._certificate_file,
-            timeout=timeout_sec,
-            cert=self._client_cert,
-        )
+        # By default, keep-alive is enabled in Session's request
+        # headers. Backends may choose to close the connection
+        # while a post happens which causes an unhandled
+        # exception.
This try/except will retry the post on such exceptions + try: + resp = self._session.post( + url=self._endpoint, + data=data, + verify=self._certificate_file, + timeout=timeout_sec, + cert=self._client_cert, + ) + except ConnectionError: + resp = self._session.post( + url=self._endpoint, + data=data, + verify=self._certificate_file, + timeout=timeout_sec, + cert=self._client_cert, + ) + return resp @staticmethod def _retryable(resp: requests.Response) -> bool: From 9ee687242257d5dcb62f257145f5f84314b77138 Mon Sep 17 00:00:00 2001 From: DylanRussell Date: Thu, 24 Apr 2025 12:50:50 -0400 Subject: [PATCH 14/21] Refactor BatchLogRecordProcessor and associated tests (#4535) --- .../sdk/_logs/_internal/export/__init__.py | 20 ++++--- .../sdk/environment_variables/__init__.py | 1 + opentelemetry-sdk/tests/logs/test_export.py | 58 ++++++++++++++----- 3 files changed, 56 insertions(+), 23 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py index 39452d4cbc1..223e27d6af7 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py @@ -210,6 +210,7 @@ def __init__( self._schedule_delay = schedule_delay_millis / 1e3 self._max_export_batch_size = max_export_batch_size # Not used. No way currently to pass timeout to export. + # TODO(https://github.com/open-telemetry/opentelemetry-python/issues/4555): figure out what this should do. self._export_timeout_millis = export_timeout_millis # Deque is thread safe. self._queue = collections.deque([], max_queue_size) @@ -218,9 +219,10 @@ def __init__( target=self.worker, daemon=True, ) + self._shutdown = False self._export_lock = threading.Lock() - self._worker_sleep = threading.Event() + self._worker_awaken = threading.Event() self._worker_thread.start() if hasattr(os, "register_at_fork"): weak_reinit = weakref.WeakMethod(self._at_fork_reinit) @@ -235,15 +237,15 @@ def _should_export_batch( # Always continue to export while queue length exceeds max batch size. if len(self._queue) >= self._max_export_batch_size: return True - if batch_strategy == BatchLogExportStrategy.EXPORT_ALL: + if batch_strategy is BatchLogExportStrategy.EXPORT_ALL: return True - if batch_strategy == BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: + if batch_strategy is BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: return num_iterations == 0 return False def _at_fork_reinit(self): self._export_lock = threading.Lock() - self._worker_sleep = threading.Event() + self._worker_awaken = threading.Event() self._queue.clear() self._worker_thread = threading.Thread( name="OtelBatchLogRecordProcessor", @@ -258,7 +260,7 @@ def worker(self): # Lots of strategies in the spec for setting next timeout. # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/sdk.md#batching-processor. # Shutdown will interrupt this sleep. Emit will interrupt this sleep only if the queue is bigger then threshold. 
- sleep_interrupted = self._worker_sleep.wait(self._schedule_delay) + sleep_interrupted = self._worker_awaken.wait(self._schedule_delay) if self._shutdown: break self._export( @@ -266,7 +268,7 @@ def worker(self): if sleep_interrupted else BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH ) - self._worker_sleep.clear() + self._worker_awaken.clear() self._export(BatchLogExportStrategy.EXPORT_ALL) def _export(self, batch_strategy: BatchLogExportStrategy) -> None: @@ -296,7 +298,7 @@ def _export(self, batch_strategy: BatchLogExportStrategy) -> None: def emit(self, log_data: LogData) -> None: if self._shutdown: - _logger.warning("Shutdown called, ignoring log.") + _logger.info("Shutdown called, ignoring log.") return if self._pid != os.getpid(): _BSP_RESET_ONCE.do_once(self._at_fork_reinit) @@ -305,7 +307,7 @@ def emit(self, log_data: LogData) -> None: _logger.warning("Queue full, dropping log.") self._queue.appendleft(log_data) if len(self._queue) >= self._max_export_batch_size: - self._worker_sleep.set() + self._worker_awaken.set() def shutdown(self): if self._shutdown: @@ -313,7 +315,7 @@ def shutdown(self): # Prevents emit and force_flush from further calling export. self._shutdown = True # Interrupts sleep in the worker, if it's sleeping. - self._worker_sleep.set() + self._worker_awaken.set() # Main worker loop should exit after one final export call with flush all strategy. self._worker_thread.join() self._exporter.shutdown() diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py index e6e91e84c6a..038cb95a78b 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py @@ -87,6 +87,7 @@ .. envvar:: OTEL_BLRP_EXPORT_TIMEOUT The :envvar:`OTEL_BLRP_EXPORT_TIMEOUT` represents the maximum allowed time to export data from the BatchLogRecordProcessor. +This environment variable currently does nothing, see https://github.com/open-telemetry/opentelemetry-python/issues/4555. Default: 30000 """ diff --git a/opentelemetry-sdk/tests/logs/test_export.py b/opentelemetry-sdk/tests/logs/test_export.py index 3d3c8de41e9..6640246f4bc 100644 --- a/opentelemetry-sdk/tests/logs/test_export.py +++ b/opentelemetry-sdk/tests/logs/test_export.py @@ -486,24 +486,20 @@ def test_logs_exported_once_batch_size_reached(self): exporter.export.assert_called_once() after_export = time.time_ns() # Shows the worker's 30 second sleep was interrupted within a second. - self.assertTrue((after_export - before_export) < 1e9) + self.assertLess(after_export - before_export, 1e9) # pylint: disable=no-self-use def test_logs_exported_once_schedule_delay_reached(self): exporter = Mock() log_record_processor = BatchLogRecordProcessor( exporter=exporter, - # Should not reach this during the test, instead export should be called when delay millis is hit. 
max_queue_size=15, max_export_batch_size=15, schedule_delay_millis=100, ) - for _ in range(15): - log_record_processor.emit(EMPTY_LOG) - time.sleep(0.11) - exporter.export.assert_has_calls( - [call([EMPTY_LOG]) for _ in range(15)] - ) + log_record_processor.emit(EMPTY_LOG) + time.sleep(0.2) + exporter.export.assert_called_once_with([EMPTY_LOG]) def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self): exporter = Mock() @@ -517,15 +513,16 @@ def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self): # This log should be flushed because it was written before shutdown. log_record_processor.emit(EMPTY_LOG) log_record_processor.shutdown() + exporter.export.assert_called_once_with([EMPTY_LOG]) self.assertTrue(exporter._stopped) - with self.assertLogs(level="WARNING") as log: + with self.assertLogs(level="INFO") as log: # This log should not be flushed. log_record_processor.emit(EMPTY_LOG) self.assertEqual(len(log.output), 1) self.assertEqual(len(log.records), 1) self.assertIn("Shutdown called, ignoring log.", log.output[0]) - exporter.export.assert_called_once_with([EMPTY_LOG]) + exporter.export.assert_called_once() # pylint: disable=no-self-use def test_force_flush_flushes_logs(self): @@ -554,6 +551,7 @@ def bulk_log_and_flush(num_logs): with ThreadPoolExecutor(max_workers=69) as executor: for idx in range(69): executor.submit(bulk_log_and_flush, idx + 1) + executor.shutdown() finished_logs = exporter.get_finished_logs() @@ -563,21 +561,53 @@ def bulk_log_and_flush(num_logs): hasattr(os, "fork"), "needs *nix", ) - def test_batch_log_record_processor_fork(self): + def test_batch_log_record_processor_fork_clears_logs_from_child(self): exporter = InMemoryLogExporter() log_record_processor = BatchLogRecordProcessor( exporter, max_export_batch_size=64, schedule_delay_millis=30000, ) - # These are not expected to be flushed. Calling fork clears any logs not flushed. + # These logs should be flushed only from the parent process. + # _at_fork_reinit should be called in the child process, to + # clear these logs in the child process. for _ in range(10): log_record_processor.emit(EMPTY_LOG) + + # The below test also needs this, but it can only be set once. 
multiprocessing.set_start_method("fork") def child(conn): - for _ in range(100): + log_record_processor.force_flush() + logs = exporter.get_finished_logs() + conn.send(len(logs) == 0) + conn.close() + + parent_conn, child_conn = multiprocessing.Pipe() + process = multiprocessing.Process(target=child, args=(child_conn,)) + process.start() + self.assertTrue(parent_conn.recv()) + process.join() + log_record_processor.force_flush() + self.assertTrue(len(exporter.get_finished_logs()) == 10) + + @unittest.skipUnless( + hasattr(os, "fork"), + "needs *nix", + ) + def test_batch_log_record_processor_fork_doesnot_deadlock(self): + exporter = InMemoryLogExporter() + log_record_processor = BatchLogRecordProcessor( + exporter, + max_export_batch_size=64, + schedule_delay_millis=30000, + ) + + def child(conn): + def _target(): log_record_processor.emit(EMPTY_LOG) + + ConcurrencyTestBase.run_with_many_threads(_target, 100) log_record_processor.force_flush() logs = exporter.get_finished_logs() conn.send(len(logs) == 100) @@ -588,7 +618,6 @@ def child(conn): process.start() self.assertTrue(parent_conn.recv()) process.join() - self.assertTrue(len(exporter.get_finished_logs()) == 0) def test_batch_log_record_processor_gc(self): # Given a BatchLogRecordProcessor @@ -650,4 +679,5 @@ def formatter(record): # pylint: disable=unused-argument mock_stdout = Mock() exporter = ConsoleLogExporter(out=mock_stdout, formatter=formatter) exporter.export([EMPTY_LOG]) + mock_stdout.write.assert_called_once_with(mock_record_str) From e6ac3528a1f6b6ebd444668930a19c0193887a9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Em=C3=ADdio=20Neto?= <9735060+emdneto@users.noreply.github.com> Date: Mon, 28 Apr 2025 11:54:15 -0300 Subject: [PATCH 15/21] infra: Automate SHA procedure during releases (#4547) * trying sha-automation Signed-off-by: emdneto <9735060+emdneto@users.noreply.github.com> * fix label names * fix sha-automation core Signed-off-by: emdneto <9735060+emdneto@users.noreply.github.com> * add new line Signed-off-by: emdneto <9735060+emdneto@users.noreply.github.com> --------- Signed-off-by: emdneto <9735060+emdneto@users.noreply.github.com> --- .github/workflows/contrib.yml | 6 ++++- .github/workflows/lint_0.yml | 10 +++++++- .github/workflows/misc_0.yml | 10 +++++++- .github/workflows/prepare-patch-release.yml | 13 ++++++++-- .github/workflows/prepare-release-branch.yml | 26 +++++++++++++++++--- .github/workflows/templates/lint.yml.j2 | 10 +++++++- .github/workflows/templates/misc.yml.j2 | 10 +++++++- .github/workflows/templates/test.yml.j2 | 10 +++++++- .github/workflows/test_0.yml | 10 +++++++- .github/workflows/test_1.yml | 10 +++++++- 10 files changed, 101 insertions(+), 14 deletions(-) diff --git a/.github/workflows/contrib.yml b/.github/workflows/contrib.yml index d59a452239b..395f3b31a49 100644 --- a/.github/workflows/contrib.yml +++ b/.github/workflows/contrib.yml @@ -15,4 +15,8 @@ jobs: uses: open-telemetry/opentelemetry-python-contrib/.github/workflows/core_contrib_test_0.yml@main with: CORE_REPO_SHA: ${{ github.sha }} - CONTRIB_REPO_SHA: main + CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }} diff --git a/.github/workflows/lint_0.yml b/.github/workflows/lint_0.yml index e06b0b65fce..77320068972 100644 --- a/.github/workflows/lint_0.yml +++ 
b/.github/workflows/lint_0.yml @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. + CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/misc_0.yml b/.github/workflows/misc_0.yml index 0b7999d3bd6..1497bbe8c45 100644 --- a/.github/workflows/misc_0.yml +++ b/.github/workflows/misc_0.yml @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. + CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/prepare-patch-release.yml b/.github/workflows/prepare-patch-release.yml index e37b78afae3..680b3842b99 100644 --- a/.github/workflows/prepare-patch-release.yml +++ b/.github/workflows/prepare-patch-release.yml @@ -65,6 +65,7 @@ jobs: run: .github/scripts/use-cla-approved-github-bot.sh - name: Create pull request + id: create_pr env: # not using secrets.GITHUB_TOKEN since pull requests from that token do not run workflows GITHUB_TOKEN: ${{ secrets.OPENTELEMETRYBOT_GITHUB_TOKEN }} @@ -74,7 +75,15 @@ jobs: git commit -a -m "$message" git push origin HEAD:$branch - gh pr create --title "[$GITHUB_REF_NAME] $message" \ + pr_url=$(gh pr create --title "[$GITHUB_REF_NAME] $message" \ --body "$message." 
\ --head $branch \ - --base $GITHUB_REF_NAME + --base $GITHUB_REF_NAME) + + echo "pr_url=$pr_url" >> $GITHUB_OUTPUT + - name: Add prepare-release label to PR + if: steps.create_pr.outputs.pr_url != '' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr edit ${{ steps.create_pr.outputs.pr_url }} --add-label "prepare-release" diff --git a/.github/workflows/prepare-release-branch.yml b/.github/workflows/prepare-release-branch.yml index 18bad26bfbb..edb906ed16c 100644 --- a/.github/workflows/prepare-release-branch.yml +++ b/.github/workflows/prepare-release-branch.yml @@ -91,6 +91,7 @@ jobs: run: .github/scripts/use-cla-approved-github-bot.sh - name: Create pull request against the release branch + id: create_release_branch_pr env: # not using secrets.GITHUB_TOKEN since pull requests from that token do not run workflows GITHUB_TOKEN: ${{ secrets.OPENTELEMETRYBOT_GITHUB_TOKEN }} @@ -100,10 +101,18 @@ jobs: git commit -a -m "$message" git push origin HEAD:$branch - gh pr create --title "[$RELEASE_BRANCH_NAME] $message" \ + pr_url=$(gh pr create --title "[$RELEASE_BRANCH_NAME] $message" \ --body "$message." \ --head $branch \ - --base $RELEASE_BRANCH_NAME + --base $RELEASE_BRANCH_NAME) + echo "pr_url=$pr_url" >> $GITHUB_OUTPUT + + - name: Add prepare-release label to PR + if: steps.create_release_branch_pr.outputs.pr_url != '' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr edit ${{ steps.create_release_branch_pr.outputs.pr_url }} --add-label "prepare-release" create-pull-request-against-main: runs-on: ubuntu-latest @@ -170,6 +179,7 @@ jobs: run: .github/scripts/use-cla-approved-github-bot.sh - name: Create pull request against main + id: create_main_pr env: # not using secrets.GITHUB_TOKEN since pull requests from that token do not run workflows GITHUB_TOKEN: ${{ secrets.OPENTELEMETRYBOT_GITHUB_TOKEN }} @@ -180,7 +190,15 @@ jobs: git commit -a -m "$message" git push origin HEAD:$branch - gh pr create --title "$message" \ + pr_url=$(gh pr create --title "$message" \ --body "$body" \ --head $branch \ - --base main + --base main) + echo "pr_url=$pr_url" >> $GITHUB_OUTPUT + + - name: Add prepare-release label to PR + if: steps.create_main_pr.outputs.pr_url != '' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr edit ${{ steps.create_main_pr.outputs.pr_url }} --add-label "prepare-release" diff --git a/.github/workflows/templates/lint.yml.j2 b/.github/workflows/templates/lint.yml.j2 index e373be8d69e..169f8f61dc4 100644 --- a/.github/workflows/templates/lint.yml.j2 +++ b/.github/workflows/templates/lint.yml.j2 @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. 
+ CONTRIB_REPO_SHA: {% raw %}${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }}{% endraw %} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/templates/misc.yml.j2 b/.github/workflows/templates/misc.yml.j2 index 1cd3c27a42d..d2f5fed3b83 100644 --- a/.github/workflows/templates/misc.yml.j2 +++ b/.github/workflows/templates/misc.yml.j2 @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. + CONTRIB_REPO_SHA: {% raw %}${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }}{% endraw %} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/templates/test.yml.j2 b/.github/workflows/templates/test.yml.j2 index efd9e311224..8e8338b6237 100644 --- a/.github/workflows/templates/test.yml.j2 +++ b/.github/workflows/templates/test.yml.j2 @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. + CONTRIB_REPO_SHA: {% raw %}${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }}{% endraw %} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/test_0.yml b/.github/workflows/test_0.yml index 2b33a23c476..dcfec0fa0a7 100644 --- a/.github/workflows/test_0.yml +++ b/.github/workflows/test_0.yml @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. 
+ CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }} PIP_EXISTS_ACTION: w jobs: diff --git a/.github/workflows/test_1.yml b/.github/workflows/test_1.yml index de5a446d3f6..b3d5a75ff4a 100644 --- a/.github/workflows/test_1.yml +++ b/.github/workflows/test_1.yml @@ -15,7 +15,15 @@ concurrency: env: CORE_REPO_SHA: main - CONTRIB_REPO_SHA: main + # Set the SHA to the branch name if the PR has a label 'prepare-release' or 'backport' otherwise, set it to 'main' + # For PRs you can change the inner fallback ('main') + # For pushes you change the outer fallback ('main') + # The logic below is used during releases and depends on having an equivalent branch name in the contrib repo. + CONTRIB_REPO_SHA: ${{ github.event_name == 'pull_request' && ( + contains(github.event.pull_request.labels.*.name, 'prepare-release') && github.event.pull_request.head.ref || + contains(github.event.pull_request.labels.*.name, 'backport') && github.event.pull_request.base.ref || + 'main' + ) || 'main' }} PIP_EXISTS_ACTION: w jobs: From 9f9c3b17d6a54f8f5750b1a045dfc0322757c266 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Wed, 9 Apr 2025 18:35:31 +0000 Subject: [PATCH 16/21] Refactor BatchLogRecordProcessor --- .../sdk/_logs/_internal/export/__init__.py | 55 +++++++++++++++---- opentelemetry-sdk/tests/logs/test_export.py | 48 ++++++++++------ 2 files changed, 76 insertions(+), 27 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py index 223e27d6af7..d6a53b805f1 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py @@ -197,6 +197,7 @@ def __init__( BatchLogRecordProcessor._default_max_export_batch_size() ) # Not used. No way currently to pass timeout to export. + # Not used. No way currently to pass timeout to export. if export_timeout_millis is None: export_timeout_millis = ( BatchLogRecordProcessor._default_export_timeout_millis() @@ -208,21 +209,23 @@ def __init__( self._exporter = exporter self._max_queue_size = max_queue_size self._schedule_delay = schedule_delay_millis / 1e3 + self._schedule_delay = schedule_delay_millis / 1e3 self._max_export_batch_size = max_export_batch_size # Not used. No way currently to pass timeout to export. # TODO(https://github.com/open-telemetry/opentelemetry-python/issues/4555): figure out what this should do. + # Not used. No way currently to pass timeout to export. self._export_timeout_millis = export_timeout_millis # Deque is thread safe. + # Deque is thread safe. self._queue = collections.deque([], max_queue_size) self._worker_thread = threading.Thread( name="OtelBatchLogRecordProcessor", target=self.worker, daemon=True, ) - self._shutdown = False self._export_lock = threading.Lock() - self._worker_awaken = threading.Event() + self._worker_sleep = threading.Event() self._worker_thread.start() if hasattr(os, "register_at_fork"): weak_reinit = weakref.WeakMethod(self._at_fork_reinit) @@ -237,15 +240,15 @@ def _should_export_batch( # Always continue to export while queue length exceeds max batch size. 
if len(self._queue) >= self._max_export_batch_size: return True - if batch_strategy is BatchLogExportStrategy.EXPORT_ALL: + if batch_strategy == BatchLogExportStrategy.EXPORT_ALL: return True - if batch_strategy is BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: + if batch_strategy == BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: return num_iterations == 0 return False def _at_fork_reinit(self): self._export_lock = threading.Lock() - self._worker_awaken = threading.Event() + self._worker_sleep = threading.Event() self._queue.clear() self._worker_thread = threading.Thread( name="OtelBatchLogRecordProcessor", @@ -260,7 +263,7 @@ def worker(self): # Lots of strategies in the spec for setting next timeout. # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/sdk.md#batching-processor. # Shutdown will interrupt this sleep. Emit will interrupt this sleep only if the queue is bigger then threshold. - sleep_interrupted = self._worker_awaken.wait(self._schedule_delay) + sleep_interrupted = self._worker_sleep.wait(self._schedule_delay) if self._shutdown: break self._export( @@ -268,9 +271,33 @@ def worker(self): if sleep_interrupted else BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH ) - self._worker_awaken.clear() + self._worker_sleep.clear() self._export(BatchLogExportStrategy.EXPORT_ALL) + def _export(self, batch_strategy: BatchLogExportStrategy) -> None: + with self._export_lock: + iteration = 0 + # We could see concurrent export calls from worker and force_flush. We call _should_export_batch + # once the lock is obtained to see if we still need to make the requested export. + while self._should_export_batch(batch_strategy, iteration): + iteration += 1 + token = attach(set_value(_SUPPRESS_INSTRUMENTATION_KEY, True)) + try: + self._exporter.export( + [ + # Oldest records are at the back, so pop from there. + self._queue.pop() + for _ in range( + min( + self._max_export_batch_size, + len(self._queue), + ) + ) + ] + ) + except Exception: # pylint: disable=broad-exception-caught + _logger.exception("Exception while exporting logs.") + detach(token) def _export(self, batch_strategy: BatchLogExportStrategy) -> None: with self._export_lock: iteration = 0 @@ -298,24 +325,29 @@ def _export(self, batch_strategy: BatchLogExportStrategy) -> None: def emit(self, log_data: LogData) -> None: if self._shutdown: - _logger.info("Shutdown called, ignoring log.") + _logger.warning("Shutdown called, ignoring log.") return if self._pid != os.getpid(): _BSP_RESET_ONCE.do_once(self._at_fork_reinit) + if len(self._queue) == self._max_queue_size: + _logger.warning("Queue full, dropping log.") if len(self._queue) == self._max_queue_size: _logger.warning("Queue full, dropping log.") self._queue.appendleft(log_data) if len(self._queue) >= self._max_export_batch_size: - self._worker_awaken.set() + self._worker_sleep.set() def shutdown(self): + if self._shutdown: + return + # Prevents emit and force_flush from further calling export. if self._shutdown: return # Prevents emit and force_flush from further calling export. self._shutdown = True # Interrupts sleep in the worker, if it's sleeping. - self._worker_awaken.set() + self._worker_sleep.set() # Main worker loop should exit after one final export call with flush all strategy. self._worker_thread.join() self._exporter.shutdown() @@ -325,6 +357,9 @@ def force_flush(self, timeout_millis: Optional[int] = None) -> bool: return # Blocking call to export. 
self._export(BatchLogExportStrategy.EXPORT_ALL) + return + # Blocking call to export. + self._export(BatchLogExportStrategy.EXPORT_ALL) @staticmethod def _default_max_queue_size(): diff --git a/opentelemetry-sdk/tests/logs/test_export.py b/opentelemetry-sdk/tests/logs/test_export.py index 6640246f4bc..dc184ba5485 100644 --- a/opentelemetry-sdk/tests/logs/test_export.py +++ b/opentelemetry-sdk/tests/logs/test_export.py @@ -54,6 +54,11 @@ instrumentation_scope=InstrumentationScope("example", "example"), ) +EMPTY_LOG = LogData( + log_record=LogRecord(), + instrumentation_scope=InstrumentationScope("example", "example"), +) + class TestSimpleLogRecordProcessor(unittest.TestCase): def test_simple_log_record_processor_default_level(self): @@ -332,6 +337,7 @@ def test_simple_log_record_processor_different_msg_types_with_formatter( self.assertEqual(expected, emitted) +class TestBatchLogRecordProcessor(unittest.TestCase): class TestBatchLogRecordProcessor(unittest.TestCase): def test_emit_call_log_record(self): exporter = InMemoryLogExporter() @@ -358,6 +364,7 @@ def test_args(self): self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 1024) self.assertEqual(log_record_processor._schedule_delay, 2.5) + self.assertEqual(log_record_processor._schedule_delay, 2.5) self.assertEqual(log_record_processor._max_export_batch_size, 256) self.assertEqual(log_record_processor._export_timeout_millis, 15000) @@ -376,6 +383,7 @@ def test_env_vars(self): self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 1024) self.assertEqual(log_record_processor._schedule_delay, 2.5) + self.assertEqual(log_record_processor._schedule_delay, 2.5) self.assertEqual(log_record_processor._max_export_batch_size, 256) self.assertEqual(log_record_processor._export_timeout_millis, 15000) @@ -385,6 +393,7 @@ def test_args_defaults(self): self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 2048) self.assertEqual(log_record_processor._schedule_delay, 5) + self.assertEqual(log_record_processor._schedule_delay, 5) self.assertEqual(log_record_processor._max_export_batch_size, 512) self.assertEqual(log_record_processor._export_timeout_millis, 30000) @@ -405,6 +414,7 @@ def test_args_env_var_value_error(self): self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 2048) self.assertEqual(log_record_processor._schedule_delay, 5) + self.assertEqual(log_record_processor._schedule_delay, 5) self.assertEqual(log_record_processor._max_export_batch_size, 512) self.assertEqual(log_record_processor._export_timeout_millis, 30000) @@ -420,6 +430,7 @@ def test_args_none_defaults(self): self.assertEqual(log_record_processor._exporter, exporter) self.assertEqual(log_record_processor._max_queue_size, 2048) self.assertEqual(log_record_processor._schedule_delay, 5) + self.assertEqual(log_record_processor._schedule_delay, 5) self.assertEqual(log_record_processor._max_export_batch_size, 512) self.assertEqual(log_record_processor._export_timeout_millis, 30000) @@ -486,20 +497,24 @@ def test_logs_exported_once_batch_size_reached(self): exporter.export.assert_called_once() after_export = time.time_ns() # Shows the worker's 30 second sleep was interrupted within a second. 
- self.assertLess(after_export - before_export, 1e9) + self.assertTrue((after_export - before_export) < 1e9) # pylint: disable=no-self-use def test_logs_exported_once_schedule_delay_reached(self): exporter = Mock() log_record_processor = BatchLogRecordProcessor( exporter=exporter, + # Should not reach this during the test, instead export should be called when delay millis is hit. max_queue_size=15, max_export_batch_size=15, schedule_delay_millis=100, ) - log_record_processor.emit(EMPTY_LOG) - time.sleep(0.2) - exporter.export.assert_called_once_with([EMPTY_LOG]) + for _ in range(15): + log_record_processor.emit(EMPTY_LOG) + time.sleep(0.11) + exporter.export.assert_has_calls( + [call([EMPTY_LOG]) for _ in range(15)] + ) def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self): exporter = Mock() @@ -516,13 +531,13 @@ def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self): exporter.export.assert_called_once_with([EMPTY_LOG]) self.assertTrue(exporter._stopped) - with self.assertLogs(level="INFO") as log: + with self.assertLogs(level="WARNING") as log: # This log should not be flushed. log_record_processor.emit(EMPTY_LOG) self.assertEqual(len(log.output), 1) self.assertEqual(len(log.records), 1) self.assertIn("Shutdown called, ignoring log.", log.output[0]) - exporter.export.assert_called_once() + exporter.export.assert_called_once_with([EMPTY_LOG]) # pylint: disable=no-self-use def test_force_flush_flushes_logs(self): @@ -538,6 +553,7 @@ def test_force_flush_flushes_logs(self): log_record_processor.emit(EMPTY_LOG) log_record_processor.force_flush() exporter.export.assert_called_once_with([EMPTY_LOG for _ in range(10)]) + exporter.export.assert_called_once_with([EMPTY_LOG for _ in range(10)]) def test_with_multiple_threads(self): exporter = InMemoryLogExporter() @@ -547,11 +563,12 @@ def bulk_log_and_flush(num_logs): for _ in range(num_logs): log_record_processor.emit(EMPTY_LOG) log_record_processor.force_flush() + log_record_processor.emit(EMPTY_LOG) + log_record_processor.force_flush() with ThreadPoolExecutor(max_workers=69) as executor: for idx in range(69): executor.submit(bulk_log_and_flush, idx + 1) - executor.shutdown() finished_logs = exporter.get_finished_logs() @@ -561,20 +578,17 @@ def bulk_log_and_flush(num_logs): hasattr(os, "fork"), "needs *nix", ) - def test_batch_log_record_processor_fork_clears_logs_from_child(self): + def test_batch_log_record_processor_fork(self): exporter = InMemoryLogExporter() log_record_processor = BatchLogRecordProcessor( exporter, max_export_batch_size=64, schedule_delay_millis=30000, + schedule_delay_millis=30000, ) - # These logs should be flushed only from the parent process. - # _at_fork_reinit should be called in the child process, to - # clear these logs in the child process. + # These are not expected to be flushed. Calling fork clears any logs not flushed. for _ in range(10): log_record_processor.emit(EMPTY_LOG) - - # The below test also needs this, but it can only be set once. 
multiprocessing.set_start_method("fork") def child(conn): @@ -604,10 +618,8 @@ def test_batch_log_record_processor_fork_doesnot_deadlock(self): ) def child(conn): - def _target(): + for _ in range(100): log_record_processor.emit(EMPTY_LOG) - - ConcurrencyTestBase.run_with_many_threads(_target, 100) log_record_processor.force_flush() logs = exporter.get_finished_logs() conn.send(len(logs) == 100) @@ -616,8 +628,11 @@ def _target(): parent_conn, child_conn = multiprocessing.Pipe() process = multiprocessing.Process(target=child, args=(child_conn,)) process.start() + process = multiprocessing.Process(target=child, args=(child_conn,)) + process.start() self.assertTrue(parent_conn.recv()) process.join() + self.assertTrue(len(exporter.get_finished_logs()) == 0) def test_batch_log_record_processor_gc(self): # Given a BatchLogRecordProcessor @@ -679,5 +694,4 @@ def formatter(record): # pylint: disable=unused-argument mock_stdout = Mock() exporter = ConsoleLogExporter(out=mock_stdout, formatter=formatter) exporter.export([EMPTY_LOG]) - mock_stdout.write.assert_called_once_with(mock_record_str) From c40c9bfa6bdb336cc25dc6b5e990b491b23cb969 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Tue, 22 Apr 2025 20:19:51 +0000 Subject: [PATCH 17/21] Add a timeout to export calls --- .../otlp/proto/grpc/_log_exporter/__init__.py | 12 +- .../exporter/otlp/proto/grpc/exporter.py | 137 ++++++--------- .../proto/grpc/metric_exporter/__init__.py | 22 ++- .../proto/grpc/trace_exporter/__init__.py | 14 +- .../test-requirements.txt | 1 + .../tests/test_otlp_exporter_mixin.py | 166 ++++++++++-------- .../tests/test_otlp_metrics_exporter.py | 6 +- .../tests/test_otlp_trace_exporter.py | 10 +- .../otlp/proto/http/_log_exporter/__init__.py | 37 ++-- .../proto/http/metric_exporter/__init__.py | 35 ++-- .../proto/http/trace_exporter/__init__.py | 60 +++---- .../metrics/test_otlp_metrics_exporter.py | 70 +++++--- .../tests/test_proto_log_exporter.py | 68 ++++--- .../tests/test_proto_span_exporter.py | 90 ++++++---- .../sdk/environment_variables/__init__.py | 8 +- 15 files changed, 410 insertions(+), 326 deletions(-) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py index 8f629899d77..b6a286ad27a 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/_log_exporter/__init__.py @@ -58,7 +58,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): if insecure is None: @@ -79,7 +79,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_LOGS_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( @@ -107,8 +107,12 @@ def _translate_data( ) -> ExportLogsServiceRequest: return encode_logs(data) - def export(self, batch: Sequence[LogData]) -> LogExportResult: - return self._export(batch) + def export( + self, batch: Sequence[LogData], timeout_millis: Optional[float] = None + ) -> LogExportResult: + return self._export( + batch, timeout_millis / 1e3 if timeout_millis else 
None + ) def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: OTLPExporterMixin.shutdown(self, timeout_millis=timeout_millis) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py index 79270b99a0c..d169d1e5a80 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/exporter.py @@ -14,12 +14,12 @@ """OTLP Exporter""" +import json import threading from abc import ABC, abstractmethod from collections.abc import Sequence # noqa: F401 from logging import getLogger from os import environ -from time import sleep from typing import ( # noqa: F401 Any, Callable, @@ -35,7 +35,6 @@ from urllib.parse import urlparse from deprecated import deprecated -from google.rpc.error_details_pb2 import RetryInfo from grpc import ( ChannelCredentials, @@ -47,7 +46,6 @@ ssl_channel_credentials, ) from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, _get_resource_data, ) from opentelemetry.exporter.otlp.proto.grpc import ( @@ -74,6 +72,29 @@ from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.util.re import parse_env_headers +json_config = json.dumps( + { + "methodConfig": [ + { + "name": [dict()], + "retryPolicy": { + "maxAttempts": 5, + "initialBackoff": "1s", + "maxBackoff": "64s", + "backoffMultiplier": 2, + "retryableStatusCodes": [ + "UNAVAILABLE", + "CANCELLED", + "RESOURCE_EXHAUSTED", + "ABORTED", + "OUT_OF_RANGE", + "DATA_LOSS", + ], + }, + } + ] + } +) logger = getLogger(__name__) SDKDataT = TypeVar("SDKDataT") ResourceDataT = TypeVar("ResourceDataT") @@ -195,7 +216,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): super().__init__() @@ -232,7 +253,7 @@ def __init__( else: self._headers = tuple(self._headers) + tuple(_OTLP_GRPC_HEADERS) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, 10) ) self._collector_kwargs = None @@ -245,7 +266,11 @@ def __init__( if insecure: self._channel = insecure_channel( - self._endpoint, compression=compression + self._endpoint, + compression=compression, + options=[ + ("grpc.service_config", json_config), + ], ) else: credentials = _get_credentials( @@ -255,7 +280,12 @@ def __init__( OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE, ) self._channel = secure_channel( - self._endpoint, credentials, compression=compression + self._endpoint, + credentials, + compression=compression, + options=[ + ("grpc.service_config", json_config), + ], ) self._client = self._stub(self._channel) @@ -269,10 +299,10 @@ def _translate_data( pass def _export( - self, data: Union[TypingSequence[ReadableSpan], MetricsData] + self, + data: Union[TypingSequence[ReadableSpan], MetricsData], + timeout_sec: Optional[float] = None, ) -> ExportResultT: - # After the call to shutdown, subsequent calls to Export are - # not allowed and should return a Failure result. 
if self._shutdown: logger.warning("Exporter already shutdown, ignoring batch") return self._result.FAILURE @@ -280,79 +310,24 @@ def _export( # FIXME remove this check if the export type for traces # gets updated to a class that represents the proto # TracesData and use the code below instead. - # logger.warning( - # "Transient error %s encountered while exporting %s, retrying in %ss.", - # error.code(), - # data.__class__.__name__, - # delay, - # ) - max_value = 64 - # expo returns a generator that yields delay values which grow - # exponentially. Once delay is greater than max_value, the yielded - # value will remain constant. - for delay in _create_exp_backoff_generator(max_value=max_value): - if delay == max_value or self._shutdown: + with self._export_lock: + try: + self._client.Export( + request=self._translate_data(data), + metadata=self._headers, + timeout=(timeout_sec or self._timeout), + ) + return self._result.SUCCESS + except RpcError as error: + logger.error( + "Failed to export %s to %s, error code: %s", + self._exporting, + self._endpoint, + error.code(), + exc_info=error.code() == StatusCode.UNKNOWN, + ) return self._result.FAILURE - with self._export_lock: - try: - self._client.Export( - request=self._translate_data(data), - metadata=self._headers, - timeout=self._timeout, - ) - - return self._result.SUCCESS - - except RpcError as error: - if error.code() in [ - StatusCode.CANCELLED, - StatusCode.DEADLINE_EXCEEDED, - StatusCode.RESOURCE_EXHAUSTED, - StatusCode.ABORTED, - StatusCode.OUT_OF_RANGE, - StatusCode.UNAVAILABLE, - StatusCode.DATA_LOSS, - ]: - retry_info_bin = dict(error.trailing_metadata()).get( - "google.rpc.retryinfo-bin" - ) - if retry_info_bin is not None: - retry_info = RetryInfo() - retry_info.ParseFromString(retry_info_bin) - delay = ( - retry_info.retry_delay.seconds - + retry_info.retry_delay.nanos / 1.0e9 - ) - - logger.warning( - ( - "Transient error %s encountered while exporting " - "%s to %s, retrying in %ss." 
- ), - error.code(), - self._exporting, - self._endpoint, - delay, - ) - sleep(delay) - continue - else: - logger.error( - "Failed to export %s to %s, error code: %s", - self._exporting, - self._endpoint, - error.code(), - exc_info=error.code() == StatusCode.UNKNOWN, - ) - - if error.code() == StatusCode.OK: - return self._result.SUCCESS - - return self._result.FAILURE - - return self._result.FAILURE - def shutdown(self, timeout_millis: float = 30_000, **kwargs) -> None: if self._shutdown: logger.warning("Exporter already shutdown, ignoring call") diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py index 8580dbb7386..8bd52fe80a9 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/metric_exporter/__init__.py @@ -13,10 +13,11 @@ from __future__ import annotations +import time from dataclasses import replace from logging import getLogger from os import environ -from typing import Iterable, List, Tuple, Union +from typing import Iterable, List, Optional, Tuple, Union from typing import Sequence as TypingSequence from grpc import ChannelCredentials, Compression @@ -99,7 +100,7 @@ def __init__( credentials: ChannelCredentials | None = None, headers: Union[TypingSequence[Tuple[str, str]], dict[str, str], str] | None = None, - timeout: int | None = None, + timeout: float | None = None, compression: Compression | None = None, preferred_temporality: dict[type, AggregationTemporality] | None = None, @@ -124,7 +125,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_METRICS_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( @@ -158,17 +159,22 @@ def _translate_data( def export( self, metrics_data: MetricsData, - timeout_millis: float = 10_000, + timeout_millis: Optional[float] = None, **kwargs, ) -> MetricExportResult: - # TODO(#2663): OTLPExporterMixin should pass timeout to gRPC + timeout_sec = ( + timeout_millis / 1e3 if timeout_millis else self._timeout # pylint: disable=protected-access + ) if self._max_export_batch_size is None: - return self._export(data=metrics_data) + return self._export(metrics_data, timeout_sec) export_result = MetricExportResult.SUCCESS - + deadline_sec = time.time() + timeout_sec for split_metrics_data in self._split_metrics_data(metrics_data): - split_export_result = self._export(data=split_metrics_data) + time_remaining_sec = deadline_sec - time.time() + split_export_result = self._export( + split_metrics_data, time_remaining_sec + ) if split_export_result is MetricExportResult.FAILURE: export_result = MetricExportResult.FAILURE diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py index c78c1b81bb6..5303d0fa840 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/src/opentelemetry/exporter/otlp/proto/grpc/trace_exporter/__init__.py @@ 
-91,7 +91,7 @@ def __init__( headers: Optional[ Union[TypingSequence[Tuple[str, str]], Dict[str, str], str] ] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, ): if insecure is None: @@ -112,7 +112,7 @@ def __init__( environ_timeout = environ.get(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT) environ_timeout = ( - int(environ_timeout) if environ_timeout is not None else None + float(environ_timeout) if environ_timeout is not None else None ) compression = ( @@ -139,8 +139,14 @@ def _translate_data( ) -> ExportTraceServiceRequest: return encode_spans(data) - def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: - return self._export(spans) + def export( + self, + spans: Sequence[ReadableSpan], + timeout_millis: Optional[float] = None, + ) -> SpanExportResult: + return self._export( + spans, timeout_millis / 1e3 if timeout_millis else None + ) def shutdown(self) -> None: OTLPExporterMixin.shutdown(self) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt b/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt index 28d778461a9..01c9f1ddadd 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/test-requirements.txt @@ -2,6 +2,7 @@ asgiref==3.7.2 Deprecated==1.2.14 googleapis-common-protos==1.63.2 grpcio==1.66.2 +grpcio-status==1.66.0 importlib-metadata==6.11.0 iniconfig==2.0.0 packaging==24.0 diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py index 656d9a6cb79..5a75595f693 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_exporter_mixin.py @@ -15,18 +15,14 @@ import threading import time from concurrent.futures import ThreadPoolExecutor -from logging import WARNING +from logging import WARNING, getLogger from typing import Any, Optional, Sequence from unittest import TestCase -from unittest.mock import Mock, patch +from unittest.mock import ANY, Mock, patch -from google.protobuf.duration_pb2 import ( # pylint: disable=no-name-in-module - Duration, -) -from google.rpc.error_details_pb2 import ( # pylint: disable=no-name-in-module - RetryInfo, -) -from grpc import Compression, StatusCode, server +from google.rpc import code_pb2, status_pb2 +from grpc import Compression, server +from grpc_status import rpc_status from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( encode_spans, @@ -55,6 +51,8 @@ SpanExportResult, ) +logger = getLogger(__name__) + # The below tests use this test SpanExporter and Spans, but are testing the # underlying behavior in the mixin. 
A MetricExporter or LogExporter could
@@ -73,8 +71,14 @@ def _translate_data(
     ) -> ExportTraceServiceRequest:
         return encode_spans(data)
 
-    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
-        return self._export(spans)
+    def export(
+        self,
+        spans: Sequence[ReadableSpan],
+        timeout_millis: Optional[float] = None,
+    ) -> SpanExportResult:
+        return self._export(
+            spans, timeout_millis / 1e3 if timeout_millis else None
+        )
 
     @property
     def _exporting(self):
@@ -87,40 +91,25 @@ def shutdown(self, timeout_millis=30_000):
 
 class TraceServiceServicerWithExportParams(TraceServiceServicer):
     def __init__(
         self,
-        export_result: StatusCode,
+        export_result: code_pb2,
         optional_export_sleep: Optional[float] = None,
-        optional_export_retry_millis: Optional[float] = None,
     ):
         self.export_result = export_result
         self.optional_export_sleep = optional_export_sleep
-        self.optional_export_retry_millis = optional_export_retry_millis
 
     # pylint: disable=invalid-name,unused-argument
     def Export(self, request, context):
+        logger.warning("Export Request Received")
         if self.optional_export_sleep:
            time.sleep(self.optional_export_sleep)
-        if self.optional_export_retry_millis:
-            context.send_initial_metadata(
-                (
-                    (
-                        "google.rpc.retryinfo-bin",
-                        RetryInfo().SerializeToString(),
-                    ),
-                )
-            )
-            context.set_trailing_metadata(
-                (
-                    (
-                        "google.rpc.retryinfo-bin",
-                        RetryInfo(
-                            retry_delay=Duration(
-                                nanos=int(self.optional_export_retry_millis)
-                            )
-                        ).SerializeToString(),
-                    ),
+        if self.export_result != code_pb2.OK:
+            context.abort_with_status(
+                rpc_status.to_status(
+                    status_pb2.Status(
+                        code=self.export_result,
+                    )
                 )
             )
-        context.set_code(self.export_result)
         return ExportTraceServiceResponse()
 
@@ -268,7 +257,9 @@ def test_otlp_exporter_otlp_compression_unspecified(
         """No env or kwarg should be NoCompression"""
         OTLPSpanExporterForTesting(insecure=True)
         mock_insecure_channel.assert_called_once_with(
-            "localhost:4317", compression=Compression.NoCompression
+            "localhost:4317",
+            compression=Compression.NoCompression,
+            options=ANY,
         )
 
     # pylint: disable=no-self-use, disable=unused-argument
@@ -292,12 +283,12 @@ def test_otlp_exporter_otlp_compression_envvar(
         """Just OTEL_EXPORTER_OTLP_COMPRESSION should work"""
         OTLPSpanExporterForTesting(insecure=True)
         mock_insecure_channel.assert_called_once_with(
-            "localhost:4317", compression=Compression.Gzip
+            "localhost:4317", compression=Compression.Gzip, options=ANY
         )
 
     def test_shutdown(self):
         add_TraceServiceServicer_to_server(
-            TraceServiceServicerWithExportParams(StatusCode.OK),
+            TraceServiceServicerWithExportParams(code_pb2.OK),
             self.server,
         )
         self.assertEqual(
@@ -316,7 +307,7 @@ def test_shutdown(self):
     def test_shutdown_wait_last_export(self):
         add_TraceServiceServicer_to_server(
             TraceServiceServicerWithExportParams(
-                StatusCode.OK, optional_export_sleep=1
+                code_pb2.OK, optional_export_sleep=1
             ),
             self.server,
         )
@@ -337,7 +328,7 @@ def test_shutdown_wait_last_export(self):
     def test_shutdown_doesnot_wait_last_export(self):
         add_TraceServiceServicer_to_server(
             TraceServiceServicerWithExportParams(
-                StatusCode.OK, optional_export_sleep=3
+                code_pb2.OK, optional_export_sleep=3
             ),
             self.server,
         )
@@ -360,7 +351,7 @@ def test_export_over_closed_grpc_channel(self):
 
         # pylint: disable=protected-access
         add_TraceServiceServicer_to_server(
-            TraceServiceServicerWithExportParams(StatusCode.OK),
+            TraceServiceServicerWithExportParams(code_pb2.OK),
             self.server,
         )
         self.exporter.export([self.span])
@@ -372,52 +363,79 @@
str(err.exception), "Cannot invoke RPC on closed channel!"
         )
 
-    @patch(
-        "opentelemetry.exporter.otlp.proto.grpc.exporter._create_exp_backoff_generator"
-    )
-    @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep")
-    def test_unavailable(self, mock_sleep, mock_expo):
-        mock_expo.configure_mock(**{"return_value": [0.01]})
-
+    def test_retry_timeout(self):
         add_TraceServiceServicer_to_server(
-            TraceServiceServicerWithExportParams(StatusCode.UNAVAILABLE),
+            TraceServiceServicerWithExportParams(code_pb2.UNAVAILABLE),
             self.server,
         )
-        result = self.exporter.export([self.span])
-        self.assertEqual(result, SpanExportResult.FAILURE)
-        mock_sleep.assert_called_with(0.01)
-
-    @patch("opentelemetry.exporter.otlp.proto.grpc.exporter.sleep")
-    def test_unavailable_delay(self, mock_sleep):
+        with self.assertLogs(level=WARNING) as warning:
+            # Set timeout to 1.5 seconds
+            self.assertEqual(
+                self.exporter.export([self.span], 1500),
+                SpanExportResult.FAILURE,
+            )
+            # Our gRPC retry policy starts with a 1 second backoff then doubles.
+            # So we expect just two calls: one at time 0, one at time 1.
+            # The final log is from when export fails.
+            self.assertEqual(len(warning.records), 3)
+            for idx, log in enumerate(warning.records):
+                if idx != 2:
+                    self.assertEqual(
+                        "Export Request Received",
+                        log.message,
+                    )
+                else:
+                    self.assertEqual(
+                        "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED",
+                        log.message,
+                    )
+        with self.assertLogs(level=WARNING) as warning:
+            exporter = OTLPSpanExporterForTesting(insecure=True, timeout=3.5)
+            # This time don't pass in a timeout to export, so it should fall back to the timeout
+            # passed to the exporter class.
+            # pylint: disable=protected-access
+            self.assertEqual(exporter._timeout, 3.5)
+            self.assertEqual(
+                exporter.export([self.span]),
+                SpanExportResult.FAILURE,
+            )
+            # We expect 3 calls: time 0, time 1, time 3, but not time 7.
+            # The final log is from when export fails.
+            self.assertEqual(len(warning.records), 4)
+            for idx, log in enumerate(warning.records):
+                if idx != 3:
+                    self.assertEqual(
+                        "Export Request Received",
+                        log.message,
+                    )
+                else:
+                    self.assertEqual(
+                        "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED",
+                        log.message,
+                    )
+
+    def test_timeout_set_correctly(self):
         add_TraceServiceServicer_to_server(
             TraceServiceServicerWithExportParams(
-                StatusCode.UNAVAILABLE,
-                optional_export_sleep=None,
-                optional_export_retry_millis=1e7,
+                code_pb2.OK, optional_export_sleep=0.5
             ),
             self.server,
         )
+        # Should time out. Deadline should be set to now + timeout.
+        # That is 400 millis from now, and export sleeps for 500 millis.
         with self.assertLogs(level=WARNING) as warning:
             self.assertEqual(
-                self.exporter.export([self.span]), SpanExportResult.FAILURE
+                self.exporter.export([self.span], 400),
+                SpanExportResult.FAILURE,
             )
-            mock_sleep.assert_called_with(0.01)
-        self.assertEqual(
-            warning.records[0].message,
-            (
-                "Transient error StatusCode.UNAVAILABLE encountered "
-                "while exporting traces to localhost:4317, retrying in 0.01s."
- ), + "Failed to export traces to localhost:4317, error code: StatusCode.DEADLINE_EXCEEDED", + warning.records[-1].message, ) - def test_success(self): - add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams(StatusCode.OK), - self.server, - ) self.assertEqual( - self.exporter.export([self.span]), SpanExportResult.SUCCESS + self.exporter.export([self.span], 600), + SpanExportResult.SUCCESS, ) def test_otlp_headers_from_env(self): @@ -431,15 +449,13 @@ def test_otlp_headers_from_env(self): def test_permanent_failure(self): with self.assertLogs(level=WARNING) as warning: add_TraceServiceServicer_to_server( - TraceServiceServicerWithExportParams( - StatusCode.ALREADY_EXISTS - ), + TraceServiceServicerWithExportParams(code_pb2.ALREADY_EXISTS), self.server, ) self.assertEqual( self.exporter.export([self.span]), SpanExportResult.FAILURE ) self.assertEqual( - warning.records[0].message, + warning.records[-1].message, "Failed to export traces to localhost:4317, error code: StatusCode.ALREADY_EXISTS", ) diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py index 2ea12f660fb..ceda6e72a8e 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_metrics_exporter.py @@ -18,7 +18,7 @@ from os.path import dirname from typing import List from unittest import TestCase -from unittest.mock import patch +from unittest.mock import ANY, patch from grpc import ChannelCredentials, Compression @@ -297,7 +297,9 @@ def test_otlp_exporter_otlp_compression_kwarg(self, mock_insecure_channel): insecure=True, compression=Compression.NoCompression ) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.NoCompression + "localhost:4317", + compression=Compression.NoCompression, + options=ANY, ) def test_split_metrics_data_many_data_points(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py index 73d8d6c7a20..ea39a7792d4 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py @@ -16,7 +16,7 @@ import os from unittest import TestCase -from unittest.mock import Mock, PropertyMock, patch +from unittest.mock import Mock, PropertyMock, patch, ANY from grpc import ChannelCredentials, Compression @@ -333,7 +333,9 @@ def test_otlp_exporter_otlp_compression_kwarg(self, mock_insecure_channel): """Specifying kwarg should take precedence over env""" OTLPSpanExporter(insecure=True, compression=Compression.NoCompression) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.NoCompression + "localhost:4317", + compression=Compression.NoCompression, + options=ANY, ) # pylint: disable=no-self-use @@ -350,7 +352,9 @@ def test_otlp_exporter_otlp_compression_precendence( """ OTLPSpanExporter(insecure=True) mock_insecure_channel.assert_called_once_with( - "localhost:4317", compression=Compression.Gzip + "localhost:4317", + compression=Compression.Gzip, + options=ANY, ) def test_translate_spans(self): diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py 
b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py index f86f0113833..c8530972ade 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/_log_exporter/__init__.py @@ -17,15 +17,12 @@ import zlib from io import BytesIO from os import environ -from time import sleep +from time import sleep, time from typing import Dict, Optional, Sequence import requests from requests.exceptions import ConnectionError -from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, -) from opentelemetry.exporter.otlp.proto.common._log_encoder import encode_logs from opentelemetry.exporter.otlp.proto.http import ( _OTLP_HTTP_HEADERS, @@ -64,8 +61,6 @@ class OTLPLogExporter(LogExporter): - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: Optional[str] = None, @@ -73,7 +68,7 @@ def __init__( client_key_file: Optional[str] = None, client_certificate_file: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): @@ -108,7 +103,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_LOGS_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -124,7 +119,7 @@ def __init__( ) self._shutdown = False - def _export(self, serialized_data: bytes): + def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -143,7 +138,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) except ConnectionError: @@ -151,7 +146,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) return resp @@ -164,7 +159,9 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def export(self, batch: Sequence[LogData]) -> LogExportResult: + def export( + self, batch: Sequence[LogData], timeout_millis: Optional[float] = None + ) -> LogExportResult: # After the call to Shutdown subsequent calls to Export are # not allowed and should return a Failure result. 
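The export loop this hunk goes on to add (and its twins in the HTTP metrics and traces exporters) replaces the old unbounded backoff generator with a single deadline: each attempt only gets the time that remains, and the loop bails out rather than start a backoff sleep that would overshoot. A rough standalone sketch of the pattern, where `send_once` and `retryable` are invented helper names for illustration:

    from time import sleep, time

    def send_with_deadline(send_once, retryable, timeout_sec):
        # One deadline bounds every attempt across all retries.
        deadline_sec = time() + timeout_sec
        for delay in [1, 2, 4, 8, 16, 32]:
            remaining_sec = deadline_sec - time()
            if remaining_sec < 1e-09:
                return False  # deadline already spent
            resp = send_once(timeout=remaining_sec)
            if resp.ok:
                return True
            if not retryable(resp) or delay > (deadline_sec - time()):
                return False  # permanent error, or next backoff overshoots
            sleep(delay)
        return False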
if self._shutdown: @@ -172,18 +169,20 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult: return LogExportResult.FAILURE serialized_data = encode_logs(batch).SerializeToString() - - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: + deadline_sec = time() + ( + timeout_millis / 1e3 if timeout_millis else self._timeout + ) + for delay in [1, 2, 4, 8, 16, 32]: + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: return LogExportResult.FAILURE - - resp = self._export(serialized_data) + resp = self._export(serialized_data, remaining_time_sec) # pylint: disable=no-else-return if resp.ok: return LogExportResult.SUCCESS elif self._retryable(resp): + if delay > (deadline_sec - time()): + return LogExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting logs batch, retrying in %ss.", resp.reason, diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py index 4feea8d4302..6c8b930fbc7 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/metric_exporter/__init__.py @@ -17,13 +17,14 @@ import zlib from io import BytesIO from os import environ -from time import sleep +from time import sleep, time from typing import ( # noqa: F401 Any, Callable, Dict, List, Mapping, + Optional, Sequence, ) @@ -32,7 +33,6 @@ from requests.exceptions import ConnectionError from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, _get_resource_data, ) from opentelemetry.exporter.otlp.proto.common._internal.metrics_encoder import ( @@ -101,8 +101,6 @@ class OTLPMetricExporter(MetricExporter, OTLPMetricExporterMixin): - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: str | None = None, @@ -110,7 +108,7 @@ def __init__( client_key_file: str | None = None, client_certificate_file: str | None = None, headers: dict[str, str] | None = None, - timeout: int | None = None, + timeout: float | None = None, compression: Compression | None = None, session: requests.Session | None = None, preferred_temporality: dict[type, AggregationTemporality] @@ -147,7 +145,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_METRICS_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -166,7 +164,7 @@ def __init__( preferred_temporality, preferred_aggregation ) - def _export(self, serialized_data: bytes): + def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -185,7 +183,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) except ConnectionError: @@ -193,7 +191,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) return resp @@ -209,21 +207,26 @@ def _retryable(resp: 
requests.Response) -> bool: def export( self, metrics_data: MetricsData, - timeout_millis: float = 10_000, + timeout_millis: Optional[float] = None, **kwargs, ) -> MetricExportResult: serialized_data = encode_metrics(metrics_data) - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: + deadline_sec = time() + ( + timeout_millis / 1e3 if timeout_millis else self._timeout + ) + for delay in [1, 2, 4, 8, 16, 32]: + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: return MetricExportResult.FAILURE - - resp = self._export(serialized_data.SerializeToString()) + resp = self._export( + serialized_data.SerializeToString(), remaining_time_sec + ) # pylint: disable=no-else-return if resp.ok: return MetricExportResult.SUCCESS elif self._retryable(resp): + if delay > (deadline_sec - time()): + return MetricExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting metric batch, retrying in %ss.", resp.reason, diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py index 1841e5210a4..6fa0f1c1bdd 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/src/opentelemetry/exporter/otlp/proto/http/trace_exporter/__init__.py @@ -17,15 +17,12 @@ import zlib from io import BytesIO from os import environ -from time import sleep -from typing import Dict, Optional +from time import sleep, time +from typing import Dict, Optional, Sequence import requests from requests.exceptions import ConnectionError -from opentelemetry.exporter.otlp.proto.common._internal import ( - _create_exp_backoff_generator, -) from opentelemetry.exporter.otlp.proto.common.trace_encoder import ( encode_spans, ) @@ -49,6 +46,7 @@ OTEL_EXPORTER_OTLP_TRACES_HEADERS, OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, ) +from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult from opentelemetry.util.re import parse_env_headers @@ -62,8 +60,6 @@ class OTLPSpanExporter(SpanExporter): - _MAX_RETRY_TIMEOUT = 64 - def __init__( self, endpoint: Optional[str] = None, @@ -71,7 +67,7 @@ def __init__( client_key_file: Optional[str] = None, client_certificate_file: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - timeout: Optional[int] = None, + timeout: Optional[float] = None, compression: Optional[Compression] = None, session: Optional[requests.Session] = None, ): @@ -105,7 +101,7 @@ def __init__( self._headers = headers or parse_env_headers( headers_string, liberal=True ) - self._timeout = timeout or int( + self._timeout = timeout or float( environ.get( OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, environ.get(OTEL_EXPORTER_OTLP_TIMEOUT, DEFAULT_TIMEOUT), @@ -121,7 +117,7 @@ def __init__( ) self._shutdown = False - def _export(self, serialized_data: bytes): + def _export(self, serialized_data: bytes, timeout_sec: float): data = serialized_data if self._compression == Compression.Gzip: gzip_data = BytesIO() @@ -140,7 +136,7 @@ def _export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) except ConnectionError: @@ -148,7 +144,7 @@ def 
_export(self, serialized_data: bytes): url=self._endpoint, data=data, verify=self._certificate_file, - timeout=self._timeout, + timeout=timeout_sec, cert=self._client_cert, ) return resp @@ -161,21 +157,32 @@ def _retryable(resp: requests.Response) -> bool: return True return False - def _serialize_spans(self, spans): - return encode_spans(spans).SerializePartialToString() + def export( + self, + spans: Sequence[ReadableSpan], + timeout_millis: Optional[float] = None, + ) -> SpanExportResult: + # After the call to Shutdown subsequent calls to Export are + # not allowed and should return a Failure result. + if self._shutdown: + _logger.warning("Exporter already shutdown, ignoring batch") + return SpanExportResult.FAILURE - def _export_serialized_spans(self, serialized_data): - for delay in _create_exp_backoff_generator( - max_value=self._MAX_RETRY_TIMEOUT - ): - if delay == self._MAX_RETRY_TIMEOUT: + serialized_data = encode_spans(spans).SerializePartialToString() + deadline_sec = time() + ( + timeout_millis / 1e3 if timeout_millis else self._timeout + ) + for delay in [1, 2, 4, 8, 16, 32]: + remaining_time_sec = deadline_sec - time() + if remaining_time_sec < 1e-09: return SpanExportResult.FAILURE - - resp = self._export(serialized_data) + resp = self._export(serialized_data, remaining_time_sec) # pylint: disable=no-else-return if resp.ok: return SpanExportResult.SUCCESS elif self._retryable(resp): + if delay > (deadline_sec - time()): + return SpanExportResult.FAILURE _logger.warning( "Transient error %s encountered while exporting span batch, retrying in %ss.", resp.reason, @@ -192,17 +199,6 @@ def _export_serialized_spans(self, serialized_data): return SpanExportResult.FAILURE return SpanExportResult.FAILURE - def export(self, spans) -> SpanExportResult: - # After the call to Shutdown subsequent calls to Export are - # not allowed and should return a Failure result. - if self._shutdown: - _logger.warning("Exporter already shutdown, ignoring batch") - return SpanExportResult.FAILURE - - serialized_data = self._serialize_spans(spans) - - return self._export_serialized_spans(serialized_data) - def shutdown(self): if self._shutdown: _logger.warning("Exporter already shutdown, ignoring call") diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py index 16bb3e54286..df7c0c17ea3 100644 --- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py +++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/metrics/test_otlp_metrics_exporter.py @@ -15,11 +15,10 @@ from logging import WARNING from os import environ from unittest import TestCase -from unittest.mock import MagicMock, Mock, call, patch +from unittest.mock import ANY, MagicMock, Mock, patch from requests import Session from requests.models import Response -from responses import POST, activate, add from opentelemetry.exporter.otlp.proto.common.metrics_encoder import ( encode_metrics, @@ -327,31 +326,10 @@ def test_serialization(self, mock_post): url=exporter._endpoint, data=serialized_data.SerializeToString(), verify=exporter._certificate_file, - timeout=exporter._timeout, + timeout=ANY, # Timeout is a float based on real time, can't put an exact value here. 
cert=exporter._client_cert,
         )
 
-    @activate
-    @patch("opentelemetry.exporter.otlp.proto.http.metric_exporter.sleep")
-    def test_exponential_backoff(self, mock_sleep):
-        # return a retryable error
-        add(
-            POST,
-            "http://metrics.example.com/export",
-            json={"error": "something exploded"},
-            status=500,
-        )
-
-        exporter = OTLPMetricExporter(
-            endpoint="http://metrics.example.com/export"
-        )
-        metrics_data = self.metrics["sum_int"]
-
-        exporter.export(metrics_data)
-        mock_sleep.assert_has_calls(
-            [call(1), call(2), call(4), call(8), call(16), call(32)]
-        )
-
     def test_aggregation_temporality(self):
         otlp_metric_exporter = OTLPMetricExporter()
 
@@ -523,3 +501,46 @@ def test_preferred_aggregation_override(self):
         self.assertEqual(
             exporter._preferred_aggregation[Histogram], histogram_aggregation
         )
+
+    @patch.object(Session, "post")
+    def test_retry_timeout(self, mock_post):
+        exporter = OTLPMetricExporter(timeout=3.5)
+
+        resp = Response()
+        resp.status_code = 503
+        resp.reason = "UNAVAILABLE"
+        mock_post.return_value = resp
+        with self.assertLogs(level=WARNING) as warning:
+            # Set timeout to 1.5 seconds
+            self.assertEqual(
+                exporter.export(self.metrics["sum_int"], 1500),
+                MetricExportResult.FAILURE,
+            )
+            # Code should return failure after retrying once; a second retry would exceed the timeout.
+            self.assertEqual(len(warning.records), 1)
+            self.assertEqual(
+                "Transient error UNAVAILABLE encountered while exporting metric batch, retrying in 1s.",
+                warning.records[0].message,
+            )
+        with self.assertLogs(level=WARNING) as warning:
+            # This time don't pass in a timeout, so it will fall back to the 3.5 seconds set on the class.
+            self.assertEqual(
+                exporter.export(self.metrics["sum_int"]),
+                MetricExportResult.FAILURE,
+            )
+            # 2 retries (after 1s and 3s).
+            self.assertEqual(len(warning.records), 2)
+
+    @patch.object(Session, "post")
+    def test_timeout_set_correctly(self, mock_post):
+        resp = Response()
+        resp.status_code = 200
+
+        def export_side_effect(*args, **kwargs):
+            # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed.
+            self.assertTrue(0.4 - kwargs["timeout"] < 0.0005)
+            return resp
+
+        mock_post.side_effect = export_side_effect
+        exporter = OTLPMetricExporter()
+        exporter.export(self.metrics["sum_int"], 400)
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py
index 66b0f890d76..00a00ae3aa9 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_log_exporter.py
@@ -15,12 +15,14 @@
 # pylint: disable=protected-access
 
 import unittest
+from logging import WARNING
 from typing import List
-from unittest.mock import MagicMock, Mock, call, patch
+from unittest.mock import MagicMock, Mock, patch
 
 import requests
-import responses
 from google.protobuf.json_format import MessageToDict
+from requests import Session
+from requests.models import Response
 
 from opentelemetry._logs import SeverityNumber
 from opentelemetry.exporter.otlp.proto.http import Compression
@@ -267,25 +269,6 @@ def test_exported_log_without_span_id(self):
         else:
             self.fail("No log records found")
 
-    @responses.activate
-    @patch("opentelemetry.exporter.otlp.proto.http._log_exporter.sleep")
-    def test_exponential_backoff(self, mock_sleep):
-        # return a retryable error
-        responses.add(
-            responses.POST,
-            "http://logs.example.com/export",
-            json={"error": "something exploded"},
-            status=500,
-        )
-
-        exporter = OTLPLogExporter(endpoint="http://logs.example.com/export")
-        logs = self._get_sdk_log_data()
-
-        exporter.export(logs)
-        mock_sleep.assert_has_calls(
-            [call(1), call(2), call(4), call(8), call(16), call(32)]
-        )
-
     @staticmethod
     def _get_sdk_log_data() -> List[LogData]:
         log1 = LogData(
@@ -365,3 +348,46 @@ def test_2xx_status_code(self, mock_otlp_metric_exporter):
         self.assertEqual(
             OTLPLogExporter().export(MagicMock()), LogExportResult.SUCCESS
         )
+
+    @patch.object(Session, "post")
+    def test_retry_timeout(self, mock_post):
+        exporter = OTLPLogExporter(timeout=3.5)
+
+        resp = Response()
+        resp.status_code = 503
+        resp.reason = "UNAVAILABLE"
+        mock_post.return_value = resp
+        with self.assertLogs(level=WARNING) as warning:
+            # Set timeout to 1.5 seconds
+            self.assertEqual(
+                exporter.export(self._get_sdk_log_data(), 1500),
+                LogExportResult.FAILURE,
+            )
+            # Code should return failure after retrying once.
+            self.assertEqual(len(warning.records), 1)
+            self.assertEqual(
+                "Transient error UNAVAILABLE encountered while exporting logs batch, retrying in 1s.",
+                warning.records[0].message,
+            )
+        with self.assertLogs(level=WARNING) as warning:
+            # This time don't pass in a timeout, so it will fall back to the 3.5 seconds set on the class.
+            self.assertEqual(
+                exporter.export(self._get_sdk_log_data()),
+                LogExportResult.FAILURE,
+            )
+            # 2 retries (after 1s and 3s).
+            self.assertEqual(len(warning.records), 2)
+
+    @patch.object(Session, "post")
+    def test_timeout_set_correctly(self, mock_post):
+        resp = Response()
+        resp.status_code = 200
+
+        def export_side_effect(*args, **kwargs):
+            # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed.
+            self.assertTrue(0.4 - kwargs["timeout"] < 0.0005)
+            return resp
+
+        mock_post.side_effect = export_side_effect
+        exporter = OTLPLogExporter()
+        exporter.export(self._get_sdk_log_data(), 400)
diff --git a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py
index 8d8ff6037aa..b7e357bbe4c 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-http/tests/test_proto_span_exporter.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 
 import unittest
-from unittest.mock import MagicMock, Mock, call, patch
+from logging import WARNING
+from unittest.mock import MagicMock, Mock, patch
 
 import requests
-import responses
+from requests import Session
+from requests.models import Response
 
 from opentelemetry.exporter.otlp.proto.http import Compression
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
@@ -52,6 +54,16 @@
 OS_ENV_CLIENT_KEY = "os/env/client-key.pem"
 OS_ENV_HEADERS = "envHeader1=val1,envHeader2=val2"
 OS_ENV_TIMEOUT = "30"
+BASIC_SPAN = _Span(
+    "abc",
+    context=Mock(
+        **{
+            "trace_state": {"a": "b", "c": "d"},
+            "span_id": 10217189687419569865,
+            "trace_id": 67545097771067222548457157018666467027,
+        }
+    ),
+)
 
 
 # pylint: disable=protected-access
@@ -227,37 +239,6 @@ def test_headers_parse_from_env(self):
             ),
         )
 
-    # pylint: disable=no-self-use
-    @responses.activate
-    @patch("opentelemetry.exporter.otlp.proto.http.trace_exporter.sleep")
-    def test_exponential_backoff(self, mock_sleep):
-        # return a retryable error
-        responses.add(
-            responses.POST,
-            "http://traces.example.com/export",
-            json={"error": "something exploded"},
-            status=500,
-        )
-
-        exporter = OTLPSpanExporter(
-            endpoint="http://traces.example.com/export"
-        )
-        span = _Span(
-            "abc",
-            context=Mock(
-                **{
-                    "trace_state": {"a": "b", "c": "d"},
-                    "span_id": 10217189687419569865,
-                    "trace_id": 67545097771067222548457157018666467027,
-                }
-            ),
-        )
-
-        exporter.export([span])
-        mock_sleep.assert_has_calls(
-            [call(1), call(2), call(4), call(8), call(16), call(32)]
-        )
-
     @patch.object(OTLPSpanExporter, "_export", return_value=Mock(ok=True))
     def test_2xx_status_code(self, mock_otlp_metric_exporter):
         """
@@ -267,3 +248,46 @@ def test_2xx_status_code(self, mock_otlp_metric_exporter):
         self.assertEqual(
             OTLPSpanExporter().export(MagicMock()), SpanExportResult.SUCCESS
         )
+
+    @patch.object(Session, "post")
+    def test_retry_timeout(self, mock_post):
+        exporter = OTLPSpanExporter(timeout=3.5)
+
+        resp = Response()
+        resp.status_code = 503
+        resp.reason = "UNAVAILABLE"
+        mock_post.return_value = resp
+        with self.assertLogs(level=WARNING) as warning:
+            # Set timeout to 1.5 seconds
+            self.assertEqual(
+                exporter.export([BASIC_SPAN], 1500),
+                SpanExportResult.FAILURE,
+            )
+            # Code should return failure after retrying once.
+            self.assertEqual(len(warning.records), 1)
+            self.assertEqual(
+                "Transient error UNAVAILABLE encountered while exporting span batch, retrying in 1s.",
+                warning.records[0].message,
+            )
+        with self.assertLogs(level=WARNING) as warning:
+            # This time don't pass in a timeout, so it will fall back to the 3.5 seconds set on the class.
+            self.assertEqual(
+                exporter.export([BASIC_SPAN]),
+                SpanExportResult.FAILURE,
+            )
+            # 2 retries (after 1s and 3s).
+ self.assertEqual(len(warning.records), 2) + + @patch.object(Session, "post") + def test_timeout_set_correctly(self, mock_post): + resp = Response() + resp.status_code = 200 + + def export_side_effect(*args, **kwargs): + # Timeout should be set to something slightly less than 400 milliseconds depending on how much time has passed. + self.assertTrue(0.4 - kwargs["timeout"] < 0.0005) + return resp + + mock_post.side_effect = export_side_effect + exporter = OTLPSpanExporter() + exporter.export([BASIC_SPAN], 400) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py index 23b634fcd85..038cb95a78b 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/environment_variables/__init__.py @@ -334,7 +334,7 @@ """ .. envvar:: OTEL_EXPORTER_OTLP_TIMEOUT -The :envvar:`OTEL_EXPORTER_OTLP_TIMEOUT` is the maximum time the OTLP exporter will wait for each batch export. +The :envvar:`OTEL_EXPORTER_OTLP_TIMEOUT` is the maximum number of seconds the OTLP exporter will wait for each batch export. Default: 10 """ @@ -536,7 +536,7 @@ """ .. envvar:: OTEL_EXPORTER_OTLP_TRACES_TIMEOUT -The :envvar:`OTEL_EXPORTER_OTLP_TRACES_TIMEOUT` is the maximum time the OTLP exporter will +The :envvar:`OTEL_EXPORTER_OTLP_TRACES_TIMEOUT` is the maximum number of seconds the OTLP exporter will wait for each batch export for spans. """ @@ -544,7 +544,7 @@ """ .. envvar:: OTEL_EXPORTER_OTLP_METRICS_TIMEOUT -The :envvar:`OTEL_EXPORTER_OTLP_METRICS_TIMEOUT` is the maximum time the OTLP exporter will +The :envvar:`OTEL_EXPORTER_OTLP_METRICS_TIMEOUT` is the maximum number of seconds the OTLP exporter will wait for each batch export for metrics. """ @@ -578,7 +578,7 @@ """ .. envvar:: OTEL_EXPORTER_OTLP_LOGS_TIMEOUT -The :envvar:`OTEL_EXPORTER_OTLP_LOGS_TIMEOUT` is the maximum time the OTLP exporter will +The :envvar:`OTEL_EXPORTER_OTLP_LOGS_TIMEOUT` is the maximum number of seconds the OTLP exporter will wait for each batch export for logs. """ From b299c7bd822a1d5e9a39f4b8d5e395b3306f4779 Mon Sep 17 00:00:00 2001 From: Dylan Russell Date: Mon, 28 Apr 2025 19:23:56 +0000 Subject: [PATCH 18/21] Add timeout millis param to export. 
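With this change the SDK exporter base classes become abstract and export() grows an optional timeout_millis parameter. A hypothetical concrete exporter under the new interface would look roughly like this (a sketch, not code from this patch):

    from typing import Optional, Sequence

    from opentelemetry.sdk.trace import ReadableSpan
    from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult

    class PrintSpanExporter(SpanExporter):
        """Hypothetical exporter showing the shape of the new interface."""

        def export(
            self,
            spans: Sequence[ReadableSpan],
            timeout_millis: Optional[int] = None,
        ) -> SpanExportResult:
            # A real exporter would stop work once timeout_millis elapses.
            for span in spans:
                print(span.to_json())
            return SpanExportResult.SUCCESS

        def shutdown(self) -> None:
            pass

        def force_flush(self, timeout_millis: int = 30000) -> bool:
            return True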
---
 .../tests/test_otlp_trace_exporter.py          |  2 +-
 .../sdk/_logs/_internal/export/__init__.py     | 15 +++++++++++++--
 .../_internal/export/in_memory_log_exporter.py |  1 +
 .../opentelemetry/sdk/trace/export/__init__.py | 11 +++++++++--
 4 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py
index ea39a7792d4..5238dc91224 100644
--- a/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py
+++ b/exporter/opentelemetry-exporter-otlp-proto-grpc/tests/test_otlp_trace_exporter.py
@@ -16,7 +16,7 @@
 
 import os
 from unittest import TestCase
-from unittest.mock import Mock, PropertyMock, patch, ANY
+from unittest.mock import ANY, Mock, PropertyMock, patch
 
 from grpc import ChannelCredentials, Compression
 
diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
index 254c5f6b96d..17f9e39f015 100644
--- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
+++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
@@ -72,11 +72,14 @@ class LogExporter(abc.ABC):
     """
 
     @abc.abstractmethod
-    def export(self, batch: Sequence[LogData]):
+    def export(
+        self, batch: Sequence[LogData], timeout_millis: Optional[int] = None
+    ):
         """Exports a batch of logs.
 
         Args:
-            batch: The list of `LogData` objects to be exported
+            batch: The list of `LogData` objects to be exported.
+            timeout_millis: Optional milliseconds until Export should time out if it hasn't succeeded.
 
         Returns:
             The result of the export
@@ -89,6 +92,13 @@ def shutdown(self):
         Called when the SDK is shut down.
         """
 
+    @abc.abstractmethod
+    def force_flush(self, timeout_millis: int = 30000) -> bool:
+        """Hint to ensure that the export of any logs the exporter has received
+        prior to the call to ForceFlush SHOULD be completed as soon as possible, preferably
+        before returning from this method.
+        """
+
 
 class ConsoleLogExporter(LogExporter):
     """Implementation of :class:`LogExporter` that prints log records to the
@@ -107,6 +117,7 @@ def __init__(
         self.out = out
         self.formatter = formatter
 
+    # pylint: disable=arguments-differ
     def export(self, batch: Sequence[LogData]):
         for data in batch:
             self.out.write(self.formatter(data.log_record))
diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py
index 68cb6b7389a..910e2cb17c2 100644
--- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py
+++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/in_memory_log_exporter.py
@@ -40,6 +40,7 @@ def get_finished_logs(self) -> typing.Tuple[LogData, ...]:
         with self._lock:
             return tuple(self._logs)
 
+    # pylint: disable=arguments-differ
     def export(self, batch: typing.Sequence[LogData]) -> LogExportResult:
         if self._stopped:
             return LogExportResult.FAILURE
diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py
index 9e60d6cff9b..006d8038375 100644
--- a/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py
+++ b/opentelemetry-sdk/src/opentelemetry/sdk/trace/export/__init__.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import annotations
 
+import abc
 import collections
 import logging
 import os
@@ -56,7 +57,7 @@ class SpanExportResult(Enum):
     FAILURE = 1
 
 
-class SpanExporter:
+class SpanExporter(abc.ABC):
     """Interface for exporting spans.
 
     Interface to be implemented by services that want to export spans recorded
@@ -66,24 +67,30 @@ class SpanExporter:
     `SimpleSpanProcessor` or a `BatchSpanProcessor`.
     """
 
+    @abc.abstractmethod
     def export(
-        self, spans: typing.Sequence[ReadableSpan]
+        self,
+        spans: typing.Sequence[ReadableSpan],
+        timeout_millis: typing.Optional[int] = None,
     ) -> "SpanExportResult":
         """Exports a batch of telemetry data.
 
         Args:
             spans: The list of `opentelemetry.trace.Span` objects to be exported
+            timeout_millis: Optional milliseconds until Export should time out if it hasn't succeeded.
 
         Returns:
             The result of the export
         """
 
+    @abc.abstractmethod
     def shutdown(self) -> None:
         """Shuts down the exporter.
 
         Called when the SDK is shut down.
         """
 
+    @abc.abstractmethod
     def force_flush(self, timeout_millis: int = 30000) -> bool:
         """Hint to ensure that the export of any spans the exporter has received
         prior to the call to ForceFlush SHOULD be completed as soon as possible, preferably

From 1233e24c810ff9602b7a80d2f6d3c7f89f2bf11e Mon Sep 17 00:00:00 2001
From: Dylan Russell
Date: Wed, 9 Apr 2025 18:35:31 +0000
Subject: [PATCH 19/21] Refactor BatchLogRecordProcessor

---
 .../sdk/_logs/_internal/export/__init__.py  | 21 +++++------
 opentelemetry-sdk/tests/logs/test_export.py | 34 ++++++-----------
 2 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
index 17f9e39f015..39452d4cbc1 100644
--- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
+++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
@@ -205,13 +205,11 @@ def __init__(
         BatchLogRecordProcessor._validate_arguments(
             max_queue_size, schedule_delay_millis, max_export_batch_size
         )
-
         self._exporter = exporter
         self._max_queue_size = max_queue_size
         self._schedule_delay = schedule_delay_millis / 1e3
         self._max_export_batch_size = max_export_batch_size
         # Not used. No way currently to pass timeout to export.
-        # TODO(https://github.com/open-telemetry/opentelemetry-python/issues/4555): figure out what this should do.
         self._export_timeout_millis = export_timeout_millis
         # Deque is thread safe.
         self._queue = collections.deque([], max_queue_size)
@@ -220,10 +218,9 @@ def __init__(
             target=self.worker,
             daemon=True,
         )
-        self._shutdown = False
         self._export_lock = threading.Lock()
-        self._worker_awaken = threading.Event()
+        self._worker_sleep = threading.Event()
         self._worker_thread.start()
         if hasattr(os, "register_at_fork"):
             weak_reinit = weakref.WeakMethod(self._at_fork_reinit)
@@ -238,15 +235,15 @@ def _should_export_batch(
         # Always continue to export while queue length exceeds max batch size.
if len(self._queue) >= self._max_export_batch_size: return True - if batch_strategy is BatchLogExportStrategy.EXPORT_ALL: + if batch_strategy == BatchLogExportStrategy.EXPORT_ALL: return True - if batch_strategy is BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: + if batch_strategy == BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: return num_iterations == 0 return False def _at_fork_reinit(self): self._export_lock = threading.Lock() - self._worker_awaken = threading.Event() + self._worker_sleep = threading.Event() self._queue.clear() self._worker_thread = threading.Thread( name="OtelBatchLogRecordProcessor", @@ -261,7 +258,7 @@ def worker(self): # Lots of strategies in the spec for setting next timeout. # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/sdk.md#batching-processor. # Shutdown will interrupt this sleep. Emit will interrupt this sleep only if the queue is bigger then threshold. - sleep_interrupted = self._worker_awaken.wait(self._schedule_delay) + sleep_interrupted = self._worker_sleep.wait(self._schedule_delay) if self._shutdown: break self._export( @@ -269,7 +266,7 @@ def worker(self): if sleep_interrupted else BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH ) - self._worker_awaken.clear() + self._worker_sleep.clear() self._export(BatchLogExportStrategy.EXPORT_ALL) def _export(self, batch_strategy: BatchLogExportStrategy) -> None: @@ -299,7 +296,7 @@ def _export(self, batch_strategy: BatchLogExportStrategy) -> None: def emit(self, log_data: LogData) -> None: if self._shutdown: - _logger.info("Shutdown called, ignoring log.") + _logger.warning("Shutdown called, ignoring log.") return if self._pid != os.getpid(): _BSP_RESET_ONCE.do_once(self._at_fork_reinit) @@ -308,7 +305,7 @@ def emit(self, log_data: LogData) -> None: _logger.warning("Queue full, dropping log.") self._queue.appendleft(log_data) if len(self._queue) >= self._max_export_batch_size: - self._worker_awaken.set() + self._worker_sleep.set() def shutdown(self): if self._shutdown: @@ -316,7 +313,7 @@ def shutdown(self): # Prevents emit and force_flush from further calling export. self._shutdown = True # Interrupts sleep in the worker, if it's sleeping. - self._worker_awaken.set() + self._worker_sleep.set() # Main worker loop should exit after one final export call with flush all strategy. self._worker_thread.join() self._exporter.shutdown() diff --git a/opentelemetry-sdk/tests/logs/test_export.py b/opentelemetry-sdk/tests/logs/test_export.py index 6511b137a92..01038e08ade 100644 --- a/opentelemetry-sdk/tests/logs/test_export.py +++ b/opentelemetry-sdk/tests/logs/test_export.py @@ -21,7 +21,7 @@ import unittest import weakref from concurrent.futures import ThreadPoolExecutor -from unittest.mock import Mock, patch +from unittest.mock import Mock, call, patch from opentelemetry._logs import SeverityNumber from opentelemetry.sdk import trace @@ -46,7 +46,6 @@ ) from opentelemetry.sdk.resources import Resource as SDKResource from opentelemetry.sdk.util.instrumentation import InstrumentationScope -from opentelemetry.test.concurrency_test import ConcurrencyTestBase from opentelemetry.trace import TraceFlags from opentelemetry.trace.span import INVALID_SPAN_CONTEXT @@ -487,20 +486,24 @@ def test_logs_exported_once_batch_size_reached(self): exporter.export.assert_called_once() after_export = time.time_ns() # Shows the worker's 30 second sleep was interrupted within a second. 
- self.assertLess(after_export - before_export, 1e9) + self.assertTrue((after_export - before_export) < 1e9) # pylint: disable=no-self-use def test_logs_exported_once_schedule_delay_reached(self): exporter = Mock() log_record_processor = BatchLogRecordProcessor( exporter=exporter, + # Should not reach this during the test, instead export should be called when delay millis is hit. max_queue_size=15, max_export_batch_size=15, schedule_delay_millis=100, ) - log_record_processor.emit(EMPTY_LOG) - time.sleep(0.2) - exporter.export.assert_called_once_with([EMPTY_LOG]) + for _ in range(15): + log_record_processor.emit(EMPTY_LOG) + time.sleep(0.11) + exporter.export.assert_has_calls( + [call([EMPTY_LOG]) for _ in range(15)] + ) def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self): exporter = Mock() @@ -517,13 +520,13 @@ def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self): exporter.export.assert_called_once_with([EMPTY_LOG]) self.assertTrue(exporter._stopped) - with self.assertLogs(level="INFO") as log: + with self.assertLogs(level="WARNING") as log: # This log should not be flushed. log_record_processor.emit(EMPTY_LOG) self.assertEqual(len(log.output), 1) self.assertEqual(len(log.records), 1) self.assertIn("Shutdown called, ignoring log.", log.output[0]) - exporter.export.assert_called_once() + exporter.export.assert_called_once_with([EMPTY_LOG]) # pylint: disable=no-self-use def test_force_flush_flushes_logs(self): @@ -552,7 +555,6 @@ def bulk_log_and_flush(num_logs): with ThreadPoolExecutor(max_workers=69) as executor: for idx in range(69): executor.submit(bulk_log_and_flush, idx + 1) - executor.shutdown() finished_logs = exporter.get_finished_logs() @@ -562,20 +564,16 @@ def bulk_log_and_flush(num_logs): hasattr(os, "fork"), "needs *nix", ) - def test_batch_log_record_processor_fork_clears_logs_from_child(self): + def test_batch_log_record_processor_fork(self): exporter = InMemoryLogExporter() log_record_processor = BatchLogRecordProcessor( exporter, max_export_batch_size=64, schedule_delay_millis=30000, ) - # These logs should be flushed only from the parent process. - # _at_fork_reinit should be called in the child process, to - # clear these logs in the child process. + # These are not expected to be flushed. Calling fork clears any logs not flushed. for _ in range(10): log_record_processor.emit(EMPTY_LOG) - - # The below test also needs this, but it can only be set once. 
multiprocessing.set_start_method("fork") def child(conn): @@ -605,10 +603,8 @@ def test_batch_log_record_processor_fork_doesnot_deadlock(self): ) def child(conn): - def _target(): + for _ in range(100): log_record_processor.emit(EMPTY_LOG) - - ConcurrencyTestBase.run_with_many_threads(_target, 100) log_record_processor.force_flush() logs = exporter.get_finished_logs() conn.send(len(logs) == 100) @@ -619,6 +615,7 @@ def _target(): process.start() self.assertTrue(parent_conn.recv()) process.join() + self.assertTrue(len(exporter.get_finished_logs()) == 0) def test_batch_log_record_processor_gc(self): # Given a BatchLogRecordProcessor @@ -680,5 +677,4 @@ def formatter(record): # pylint: disable=unused-argument mock_stdout = Mock() exporter = ConsoleLogExporter(out=mock_stdout, formatter=formatter) exporter.export([EMPTY_LOG]) - mock_stdout.write.assert_called_once_with(mock_record_str) From ed344a93995dc8cf9ec3a22781a650501a04d2f0 Mon Sep 17 00:00:00 2001 From: DylanRussell Date: Thu, 24 Apr 2025 12:50:50 -0400 Subject: [PATCH 20/21] Refactor BatchLogRecordProcessor and associated tests (#4535) --- .../sdk/_logs/_internal/export/__init__.py | 20 ++++++------ opentelemetry-sdk/tests/logs/test_export.py | 31 ++++++++++--------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py index 39452d4cbc1..223e27d6af7 100644 --- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py +++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py @@ -210,6 +210,7 @@ def __init__( self._schedule_delay = schedule_delay_millis / 1e3 self._max_export_batch_size = max_export_batch_size # Not used. No way currently to pass timeout to export. + # TODO(https://github.com/open-telemetry/opentelemetry-python/issues/4555): figure out what this should do. self._export_timeout_millis = export_timeout_millis # Deque is thread safe. self._queue = collections.deque([], max_queue_size) @@ -218,9 +219,10 @@ def __init__( target=self.worker, daemon=True, ) + self._shutdown = False self._export_lock = threading.Lock() - self._worker_sleep = threading.Event() + self._worker_awaken = threading.Event() self._worker_thread.start() if hasattr(os, "register_at_fork"): weak_reinit = weakref.WeakMethod(self._at_fork_reinit) @@ -235,15 +237,15 @@ def _should_export_batch( # Always continue to export while queue length exceeds max batch size. if len(self._queue) >= self._max_export_batch_size: return True - if batch_strategy == BatchLogExportStrategy.EXPORT_ALL: + if batch_strategy is BatchLogExportStrategy.EXPORT_ALL: return True - if batch_strategy == BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: + if batch_strategy is BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH: return num_iterations == 0 return False def _at_fork_reinit(self): self._export_lock = threading.Lock() - self._worker_sleep = threading.Event() + self._worker_awaken = threading.Event() self._queue.clear() self._worker_thread = threading.Thread( name="OtelBatchLogRecordProcessor", @@ -258,7 +260,7 @@ def worker(self): # Lots of strategies in the spec for setting next timeout. # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/sdk.md#batching-processor. # Shutdown will interrupt this sleep. Emit will interrupt this sleep only if the queue is bigger then threshold. 
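The sleep-interrupt scheme described in the comment above reduces to a small threading pattern; a self-contained sketch with invented names, independent of the SDK code:

    import threading

    class PeriodicWorker:
        # The worker sleeps up to schedule_delay_sec per cycle, but wake()
        # (called on a full queue) and stop() cut the sleep short.
        def __init__(self, schedule_delay_sec: float = 5.0):
            self._schedule_delay_sec = schedule_delay_sec
            self._awaken = threading.Event()
            self._stopping = False
            self._thread = threading.Thread(target=self._run, daemon=True)
            self._thread.start()

        def _run(self):
            while not self._stopping:
                interrupted = self._awaken.wait(self._schedule_delay_sec)
                self._awaken.clear()
                # interrupted=True means an early wake-up rather than the
                # timer elapsing; a processor would export a batch here.

        def wake(self):
            self._awaken.set()

        def stop(self):
            self._stopping = True
            self._awaken.set()
            self._thread.join()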
- sleep_interrupted = self._worker_sleep.wait(self._schedule_delay) + sleep_interrupted = self._worker_awaken.wait(self._schedule_delay) if self._shutdown: break self._export( @@ -266,7 +268,7 @@ def worker(self): if sleep_interrupted else BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH ) - self._worker_sleep.clear() + self._worker_awaken.clear() self._export(BatchLogExportStrategy.EXPORT_ALL) def _export(self, batch_strategy: BatchLogExportStrategy) -> None: @@ -296,7 +298,7 @@ def _export(self, batch_strategy: BatchLogExportStrategy) -> None: def emit(self, log_data: LogData) -> None: if self._shutdown: - _logger.warning("Shutdown called, ignoring log.") + _logger.info("Shutdown called, ignoring log.") return if self._pid != os.getpid(): _BSP_RESET_ONCE.do_once(self._at_fork_reinit) @@ -305,7 +307,7 @@ def emit(self, log_data: LogData) -> None: _logger.warning("Queue full, dropping log.") self._queue.appendleft(log_data) if len(self._queue) >= self._max_export_batch_size: - self._worker_sleep.set() + self._worker_awaken.set() def shutdown(self): if self._shutdown: @@ -313,7 +315,7 @@ def shutdown(self): # Prevents emit and force_flush from further calling export. self._shutdown = True # Interrupts sleep in the worker, if it's sleeping. - self._worker_sleep.set() + self._worker_awaken.set() # Main worker loop should exit after one final export call with flush all strategy. self._worker_thread.join() self._exporter.shutdown() diff --git a/opentelemetry-sdk/tests/logs/test_export.py b/opentelemetry-sdk/tests/logs/test_export.py index 01038e08ade..6640246f4bc 100644 --- a/opentelemetry-sdk/tests/logs/test_export.py +++ b/opentelemetry-sdk/tests/logs/test_export.py @@ -486,24 +486,20 @@ def test_logs_exported_once_batch_size_reached(self): exporter.export.assert_called_once() after_export = time.time_ns() # Shows the worker's 30 second sleep was interrupted within a second. - self.assertTrue((after_export - before_export) < 1e9) + self.assertLess(after_export - before_export, 1e9) # pylint: disable=no-self-use def test_logs_exported_once_schedule_delay_reached(self): exporter = Mock() log_record_processor = BatchLogRecordProcessor( exporter=exporter, - # Should not reach this during the test, instead export should be called when delay millis is hit. max_queue_size=15, max_export_batch_size=15, schedule_delay_millis=100, ) - for _ in range(15): - log_record_processor.emit(EMPTY_LOG) - time.sleep(0.11) - exporter.export.assert_has_calls( - [call([EMPTY_LOG]) for _ in range(15)] - ) + log_record_processor.emit(EMPTY_LOG) + time.sleep(0.2) + exporter.export.assert_called_once_with([EMPTY_LOG]) def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self): exporter = Mock() @@ -520,13 +516,13 @@ def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self): exporter.export.assert_called_once_with([EMPTY_LOG]) self.assertTrue(exporter._stopped) - with self.assertLogs(level="WARNING") as log: + with self.assertLogs(level="INFO") as log: # This log should not be flushed. 
log_record_processor.emit(EMPTY_LOG)
         self.assertEqual(len(log.output), 1)
         self.assertEqual(len(log.records), 1)
         self.assertIn("Shutdown called, ignoring log.", log.output[0])
-        exporter.export.assert_called_once_with([EMPTY_LOG])
+        exporter.export.assert_called_once()

     # pylint: disable=no-self-use
     def test_force_flush_flushes_logs(self):
@@ -555,6 +551,7 @@ def bulk_log_and_flush(num_logs):
         with ThreadPoolExecutor(max_workers=69) as executor:
             for idx in range(69):
                 executor.submit(bulk_log_and_flush, idx + 1)
+
             executor.shutdown()

         finished_logs = exporter.get_finished_logs()
@@ -564,16 +561,20 @@ def bulk_log_and_flush(num_logs):
         hasattr(os, "fork"),
         "needs *nix",
     )
-    def test_batch_log_record_processor_fork(self):
+    def test_batch_log_record_processor_fork_clears_logs_from_child(self):
         exporter = InMemoryLogExporter()
         log_record_processor = BatchLogRecordProcessor(
             exporter,
             max_export_batch_size=64,
             schedule_delay_millis=30000,
         )
-        # These are not expected to be flushed. Calling fork clears any logs not flushed.
+        # These logs should be flushed only from the parent process.
+        # _at_fork_reinit should be called in the child process, to
+        # clear these logs in the child process.
         for _ in range(10):
             log_record_processor.emit(EMPTY_LOG)
+
+        # The below test also needs this, but it can only be set once.
         multiprocessing.set_start_method("fork")

         def child(conn):
@@ -603,8 +604,10 @@ def test_batch_log_record_processor_fork_doesnot_deadlock(self):
         )

         def child(conn):
-            for _ in range(100):
+            def _target():
                 log_record_processor.emit(EMPTY_LOG)
+
+            ConcurrencyTestBase.run_with_many_threads(_target, 100)
             log_record_processor.force_flush()
             logs = exporter.get_finished_logs()
             conn.send(len(logs) == 100)
@@ -615,7 +618,6 @@ def child(conn):
         process.start()
         self.assertTrue(parent_conn.recv())
         process.join()
-        self.assertTrue(len(exporter.get_finished_logs()) == 0)

     def test_batch_log_record_processor_gc(self):
         # Given a BatchLogRecordProcessor
@@ -677,4 +679,5 @@ def formatter(record):  # pylint: disable=unused-argument
         mock_stdout = Mock()
         exporter = ConsoleLogExporter(out=mock_stdout, formatter=formatter)
         exporter.export([EMPTY_LOG])
+
         mock_stdout.write.assert_called_once_with(mock_record_str)
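The wake-up mechanics introduced by the patch above are worth spelling out: the
worker waits on a single Event (_worker_awaken) with the schedule delay as a
timeout, emit() sets the event once the queue reaches a full batch, and
shutdown() sets it to unblock the final flush. Below is a condensed sketch of
that control flow; the names follow the diff, but the module-level state and
the export_batches callback are simplifications, not the SDK class itself.

    import collections
    import threading

    queue = collections.deque(maxlen=2048)
    worker_awaken = threading.Event()
    stopping = threading.Event()
    MAX_EXPORT_BATCH_SIZE = 512
    SCHEDULE_DELAY_SEC = 5.0


    def worker(export_batches):
        while not stopping.is_set():
            # True if emit()/shutdown() set the event, False if the delay expired.
            interrupted = worker_awaken.wait(SCHEDULE_DELAY_SEC)
            if stopping.is_set():
                break
            # A forced wake-up drains everything; a timer tick exports at
            # least one batch, mirroring the two BatchLogExportStrategy values.
            export_batches(drain_all=interrupted)
            worker_awaken.clear()
        export_batches(drain_all=True)  # final flush on shutdown


    def emit(log_record):
        queue.appendleft(log_record)
        if len(queue) >= MAX_EXPORT_BATCH_SIZE:
            # Wake the worker immediately instead of waiting out the delay.
            worker_awaken.set()

Note the ordering, which the diff preserves: the event is cleared only after
the export call returns, so a wake-up that arrives mid-export is folded into
that same drain instead of scheduling a redundant one.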
From 953ad93b32e657bf3ec2755a765feb602da1bf1a Mon Sep 17 00:00:00 2001
From: Dylan Russell
Date: Wed, 9 Apr 2025 18:35:31 +0000
Subject: [PATCH 21/21] Refactor BatchLogRecordProcessor

---
 .../sdk/_logs/_internal/export/__init__.py   | 19 +++++++++----------
 opentelemetry-sdk/tests/logs/test_export.py  | 31 ++++++++++++++-----------------
 2 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
index 223e27d6af7..d6a53b805f1 100644
--- a/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
+++ b/opentelemetry-sdk/src/opentelemetry/sdk/_logs/_internal/export/__init__.py
@@ -208,21 +208,20 @@ def __init__(
         self._exporter = exporter
         self._max_queue_size = max_queue_size
         self._schedule_delay = schedule_delay_millis / 1e3
         self._max_export_batch_size = max_export_batch_size
         # Not used. No way currently to pass timeout to export.
         # TODO(https://github.com/open-telemetry/opentelemetry-python/issues/4555): figure out what this should do.
         self._export_timeout_millis = export_timeout_millis
         # Deque is thread safe.
         self._queue = collections.deque([], max_queue_size)
         self._worker_thread = threading.Thread(
             name="OtelBatchLogRecordProcessor",
             target=self.worker,
             daemon=True,
         )
-
         self._shutdown = False
         self._export_lock = threading.Lock()
-        self._worker_awaken = threading.Event()
+        self._worker_sleep = threading.Event()
         self._worker_thread.start()
         if hasattr(os, "register_at_fork"):
             weak_reinit = weakref.WeakMethod(self._at_fork_reinit)
@@ -237,15 +236,15 @@ def _should_export_batch(
         # Always continue to export while queue length exceeds max batch size.
         if len(self._queue) >= self._max_export_batch_size:
             return True
-        if batch_strategy is BatchLogExportStrategy.EXPORT_ALL:
+        if batch_strategy == BatchLogExportStrategy.EXPORT_ALL:
             return True
-        if batch_strategy is BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH:
+        if batch_strategy == BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH:
             return num_iterations == 0
         return False

     def _at_fork_reinit(self):
         self._export_lock = threading.Lock()
-        self._worker_awaken = threading.Event()
+        self._worker_sleep = threading.Event()
         self._queue.clear()
         self._worker_thread = threading.Thread(
             name="OtelBatchLogRecordProcessor",
@@ -260,7 +259,7 @@ def worker(self):
             # Lots of strategies in the spec for setting next timeout.
             # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/sdk.md#batching-processor.
             # Shutdown will interrupt this sleep. Emit will interrupt this sleep only if the queue is bigger than the threshold.
-            sleep_interrupted = self._worker_awaken.wait(self._schedule_delay)
+            sleep_interrupted = self._worker_sleep.wait(self._schedule_delay)
             if self._shutdown:
                 break
             self._export(
@@ -268,9 +267,9 @@ def worker(self):
                 if sleep_interrupted
                 else BatchLogExportStrategy.EXPORT_AT_LEAST_ONE_BATCH
             )
-            self._worker_awaken.clear()
+            self._worker_sleep.clear()
         self._export(BatchLogExportStrategy.EXPORT_ALL)

     def _export(self, batch_strategy: BatchLogExportStrategy) -> None:
         with self._export_lock:
             iteration = 0
@@ -298,24 +297,24 @@ def _export(self, batch_strategy: BatchLogExportStrategy) -> None:

     def emit(self, log_data: LogData) -> None:
         if self._shutdown:
-            _logger.info("Shutdown called, ignoring log.")
+            _logger.warning("Shutdown called, ignoring log.")
             return
         if self._pid != os.getpid():
             _BSP_RESET_ONCE.do_once(self._at_fork_reinit)
         if len(self._queue) == self._max_queue_size:
             _logger.warning("Queue full, dropping log.")
         self._queue.appendleft(log_data)
         if len(self._queue) >= self._max_export_batch_size:
-            self._worker_awaken.set()
+            self._worker_sleep.set()

     def shutdown(self):
         if self._shutdown:
             return
         # Prevents emit and force_flush from further calling export.
         self._shutdown = True
         # Interrupts sleep in the worker, if it's sleeping.
-        self._worker_awaken.set()
+        self._worker_sleep.set()
         # Main worker loop should exit after one final export call with flush all strategy.
         self._worker_thread.join()
         self._exporter.shutdown()

diff --git a/opentelemetry-sdk/tests/logs/test_export.py b/opentelemetry-sdk/tests/logs/test_export.py
index 6640246f4bc..dc184ba5485 100644
--- a/opentelemetry-sdk/tests/logs/test_export.py
+++ b/opentelemetry-sdk/tests/logs/test_export.py
@@ -486,20 +486,24 @@ def test_logs_exported_once_batch_size_reached(self):
         exporter.export.assert_called_once()
         after_export = time.time_ns()
         # Shows the worker's 30 second sleep was interrupted within a second.
-        self.assertLess(after_export - before_export, 1e9)
+        self.assertTrue((after_export - before_export) < 1e9)

     # pylint: disable=no-self-use
     def test_logs_exported_once_schedule_delay_reached(self):
         exporter = Mock()
         log_record_processor = BatchLogRecordProcessor(
             exporter=exporter,
+            # Should not reach this during the test, instead export should be called when delay millis is hit.
             max_queue_size=15,
             max_export_batch_size=15,
             schedule_delay_millis=100,
         )
-        log_record_processor.emit(EMPTY_LOG)
-        time.sleep(0.2)
-        exporter.export.assert_called_once_with([EMPTY_LOG])
+        for _ in range(15):
+            log_record_processor.emit(EMPTY_LOG)
+        time.sleep(0.11)
+        exporter.export.assert_has_calls(
+            [call([EMPTY_LOG]) for _ in range(15)]
+        )

     def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self):
         exporter = Mock()
@@ -520,13 +524,13 @@ def test_logs_flushed_before_shutdown_and_dropped_after_shutdown(self):
         exporter.export.assert_called_once_with([EMPTY_LOG])
         self.assertTrue(exporter._stopped)

-        with self.assertLogs(level="INFO") as log:
+        with self.assertLogs(level="WARNING") as log:
             # This log should not be flushed.
log_record_processor.emit(EMPTY_LOG)
         self.assertEqual(len(log.output), 1)
         self.assertEqual(len(log.records), 1)
         self.assertIn("Shutdown called, ignoring log.", log.output[0])
-        exporter.export.assert_called_once()
+        exporter.export.assert_called_once_with([EMPTY_LOG])

     # pylint: disable=no-self-use
     def test_force_flush_flushes_logs(self):
@@ -547,11 +551,10 @@ def bulk_log_and_flush(num_logs):
         for _ in range(num_logs):
             log_record_processor.emit(EMPTY_LOG)
         log_record_processor.force_flush()

         with ThreadPoolExecutor(max_workers=69) as executor:
             for idx in range(69):
                 executor.submit(bulk_log_and_flush, idx + 1)
-
             executor.shutdown()

         finished_logs = exporter.get_finished_logs()
@@ -561,20 +564,16 @@ def bulk_log_and_flush(num_logs):
         hasattr(os, "fork"),
         "needs *nix",
     )
-    def test_batch_log_record_processor_fork_clears_logs_from_child(self):
+    def test_batch_log_record_processor_fork(self):
         exporter = InMemoryLogExporter()
         log_record_processor = BatchLogRecordProcessor(
             exporter,
             max_export_batch_size=64,
             schedule_delay_millis=30000,
         )
-        # These logs should be flushed only from the parent process.
-        # _at_fork_reinit should be called in the child process, to
-        # clear these logs in the child process.
+        # These are not expected to be flushed. Calling fork clears any logs not flushed.
         for _ in range(10):
             log_record_processor.emit(EMPTY_LOG)
-
-        # The below test also needs this, but it can only be set once.
         multiprocessing.set_start_method("fork")

         def child(conn):
@@ -603,10 +602,8 @@ def test_batch_log_record_processor_fork_doesnot_deadlock(self):
         )

         def child(conn):
-            def _target():
+            for _ in range(100):
                 log_record_processor.emit(EMPTY_LOG)
-
-            ConcurrencyTestBase.run_with_many_threads(_target, 100)
             log_record_processor.force_flush()
             logs = exporter.get_finished_logs()
             conn.send(len(logs) == 100)
@@ -616,8 +613,9 @@ def child(conn):
         parent_conn, child_conn = multiprocessing.Pipe()
         process = multiprocessing.Process(target=child, args=(child_conn,))
         process.start()
         self.assertTrue(parent_conn.recv())
         process.join()
+        self.assertTrue(len(exporter.get_finished_logs()) == 0)

     def test_batch_log_record_processor_gc(self):
         # Given a BatchLogRecordProcessor
@@ -679,5 +677,4 @@ def formatter(record):  # pylint: disable=unused-argument
         mock_stdout = Mock()
         exporter = ConsoleLogExporter(out=mock_stdout, formatter=formatter)
         exporter.export([EMPTY_LOG])
-
         mock_stdout.write.assert_called_once_with(mock_record_str)
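One detail both versions of _export above rely on, which the in-diff comment
("Oldest records are at the back, so pop from there.") calls out: the processor
enqueues with appendleft() and dequeues with pop(), so the right end of the
deque always holds the oldest records and each batch preserves arrival order.
A standalone sketch of that drain loop follows; drain_in_batches and its export
callback are illustrative names, not the SDK's exporter interface.

    import collections
    from typing import Any, Callable, Deque, List


    def drain_in_batches(
        records: Deque[Any],
        export: Callable[[List[Any]], None],
        max_batch: int = 512,
    ) -> None:
        while records:
            # appendleft() puts new items at the front, so the back is oldest.
            batch = [records.pop() for _ in range(min(max_batch, len(records)))]
            try:
                export(batch)
            except Exception:  # pylint: disable=broad-exception-caught
                # Mirrors the patch: swallow and continue rather than let one
                # failed export kill the worker thread.
                pass


    queue = collections.deque()
    for i in range(5):
        queue.appendleft(i)
    drain_in_batches(queue, print, max_batch=2)  # [0, 1] then [2, 3] then [4]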