-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathframework_pipeline.py
280 lines (255 loc) · 12.2 KB
/
framework_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import json
import os
from typing import Any, Dict, List, Union, Optional, Mapping
from helixtelemetry.telemetry.context.telemetry_context import TelemetryContext
from helixtelemetry.telemetry.factory.telemetry_factory import TelemetryFactory
from helixtelemetry.telemetry.providers.null_telemetry import NullTelemetry
from helixtelemetry.telemetry.providers.open_telemetry import OpenTelemetry
from helixtelemetry.telemetry.spans.telemetry_span_creator import TelemetrySpanCreator
from helixtelemetry.telemetry.spans.telemetry_span_wrapper import TelemetrySpanWrapper
from helixtelemetry.telemetry.structures.telemetry_attribute_value import (
TelemetryAttributeValue,
)
from helixtelemetry.telemetry.structures.telemetry_parent import TelemetryParent
from pyspark.ml.base import Transformer
from pyspark.sql.dataframe import DataFrame
from spark_pipeline_framework.logger.log_level import LogLevel
from spark_pipeline_framework.logger.yarn_logger import get_logger
from spark_pipeline_framework.mixins.loop_id_mixin import LoopIdMixin
from spark_pipeline_framework.mixins.telemetry_parent_mixin import TelemetryParentMixin
from spark_pipeline_framework.progress_logger.progress_log_metric import (
ProgressLogMetric,
)
from spark_pipeline_framework.progress_logger.progress_logger import ProgressLogger
from spark_pipeline_framework.transformers.framework_transformer.v1.framework_transformer import (
FrameworkTransformer,
)
from spark_pipeline_framework.utilities.async_helper.v1.async_helper import AsyncHelper
from spark_pipeline_framework.utilities.class_helpers import ClassHelpers
from spark_pipeline_framework.utilities.pipeline_helper import create_steps
class FrameworkPipeline(Transformer, LoopIdMixin, TelemetryParentMixin):
    """
    Base class for pipelines: an ordered list of transformers that is applied
    to a DataFrame one transformer at a time, with progress logging and a
    telemetry span around the pipeline and around each transformer.

    Subclasses populate either ``self.transformers`` directly or ``self.steps``
    (which is converted to transformers via ``create_steps`` on first run).
    """

    def __init__(
        self,
        parameters: Dict[str, Any],
        progress_logger: ProgressLogger,
        run_id: Optional[str] = None,
        log_level: Optional[Union[int, str]] = None,
        telemetry_enable: Optional[bool] = None,
        telemetry_context: Optional[TelemetryContext] = None,
        name: Optional[str] = None,
        attributes: Optional[Mapping[str, TelemetryAttributeValue]] = None,
        telemetry_parent: Optional[TelemetryParent] = None,
    ) -> None:
        """
        Base class for all pipelines

        :param parameters: pipeline parameters, exposed read-only through the
                           ``parameters`` property. The ``flow_run_name`` key is
                           used as the default telemetry instance name.
        :param progress_logger: logger that receives pipeline start/step/finish
                                events and exceptions
        :param run_id: optional run identifier; appended to the start event text
                       and stored in the telemetry attributes as ``run_id``
        :param log_level: log level; falls back to the ``LOGLEVEL`` environment
                          variable when not provided
        :param telemetry_enable: explicit telemetry switch. When ``None`` the
                                 ``TELEMETRY_ENABLE`` environment variable decides.
        :param telemetry_context: telemetry context to use when no
                                  ``telemetry_parent`` is supplied; built from
                                  environment variables when omitted
        :param name: name of the pipeline telemetry span; defaults to the class
                     qualified name
        :param attributes: telemetry attributes attached to the pipeline span
        :param telemetry_parent: existing telemetry parent to attach to; when
                                 supplied no new parent/context is created
        """
        super().__init__()
        self.transformers: List[Transformer] = []
        self._run_id: Optional[str] = run_id
        self.steps: List[Union[Transformer, List[Transformer]]] = []
        self.__parameters: Dict[str, Any] = parameters
        self.progress_logger: ProgressLogger = progress_logger
        self.loop_id: Optional[str] = None
        self.log_level: Optional[Union[int, str]] = log_level or os.environ.get(
            "LOGLEVEL"
        )

        # An explicit True/False from the caller always wins.  Only when the
        # caller passed nothing do we consult the environment, and there the
        # usual "off" spellings really mean off (bool("false") would be True).
        if telemetry_enable is None:
            env_flag: str = os.environ.get("TELEMETRY_ENABLE", "")
            resolved_telemetry_enable: bool = env_flag.strip().lower() not in (
                "",
                "0",
                "false",
                "no",
                "off",
            )
        else:
            resolved_telemetry_enable = telemetry_enable
        self.telemetry_enable: Optional[bool] = resolved_telemetry_enable

        if telemetry_parent:
            self.telemetry_parent = telemetry_parent
        else:
            # No parent supplied: create a root parent (no trace/span id yet),
            # using the given context or one assembled from the environment.
            # NOTE(review): the context receives the raw ``log_level`` argument,
            # not the LOGLEVEL fallback stored in ``self.log_level`` - confirm
            # whether that is intended.
            self.set_telemetry_parent(
                telemetry_parent=TelemetryParent(
                    name=name or self.__class__.__qualname__,
                    trace_id=None,
                    span_id=None,
                    telemetry_context=(
                        telemetry_context
                        or TelemetryContext(
                            provider=(
                                OpenTelemetry.telemetry_provider
                                if self.telemetry_enable
                                else NullTelemetry.telemetry_provider
                            ),
                            service_name=os.getenv(
                                "OTEL_SERVICE_NAME", "helix-pipelines"
                            ),
                            environment=os.getenv("ENV", "development"),
                            attributes=attributes,
                            log_level=log_level,
                            instance_name=os.getenv(
                                "OTEL_INSTANCE_NAME",
                                self.parameters.get("flow_run_name", "unknown"),
                            ),
                            service_namespace=os.getenv(
                                "OTEL_SERVICE_NAMESPACE", "helix-pipelines"
                            ),
                        )
                    ),
                    attributes=attributes,
                )
            )

        self.name: Optional[str] = name
        # Copy the caller's mapping so adding run_id never mutates their object.
        self.attributes: Mapping[str, TelemetryAttributeValue] = {
            **(attributes or {}),
            "run_id": self._run_id,
        }

    @property
    def parameters(self) -> Dict[str, Any]:
        """Parameters this pipeline was constructed with."""
        return self.__parameters

    # noinspection PyUnusedLocal
    def fit(self, df: DataFrame) -> "FrameworkPipeline":
        """
        Returns this pipeline unchanged; the dataframe is not used.

        :param df: input dataframe (ignored)
        :return: self
        """
        return self

    def _transform(self, df: DataFrame) -> DataFrame:
        """
        Synchronous entry point: runs ``_transform_async`` through ``AsyncHelper.run``.

        :param df: input dataframe
        :return: transformed dataframe
        """
        return AsyncHelper.run(self._transform_async(df))

    async def _transform_async(self, df: DataFrame) -> DataFrame:
        """
        Runs all the transformers in the pipeline in sequence on the input DataFrame and returns the transformed DataFrame

        Each transformer runs inside its own telemetry span (a child of the
        pipeline span) and inside a ``ProgressLogMetric``.  Transformers exposing
        ``_transform_async`` are awaited; all others go through ``transform``.

        :param df: input dataframe
        :return: dataframe produced by the last transformer
        :raises Exception: any exception raised by a transformer is logged and
                           re-raised; when it carries args, ``In Stage (<stage>)``
                           is prepended to them
        """
        telemetry_span_creator: TelemetrySpanCreator = TelemetryFactory(
            telemetry_parent=self.telemetry_parent or TelemetryParent.get_null_parent()
        ).create_telemetry_span_creator(log_level=self.log_level)

        telemetry_span: TelemetrySpanWrapper
        async with telemetry_span_creator.create_telemetry_span_async(
            name=self.name or self.__class__.__qualname__,
            attributes=self.attributes,
            telemetry_parent=self.telemetry_parent,
        ) as telemetry_span:
            # if steps are defined but not transformers then convert steps to transformers first
            if self.steps and not self.transformers:
                self.transformers = self.create_steps(self.steps)

            # get the logger to use
            logger = get_logger(__name__)
            count_of_transformers: int = len(self.transformers)
            pipeline_name: str = self.__class__.__name__

            # Build the text in two steps: a conditional expression binds looser
            # than "+", so writing `a + b if cond else ""` would blank out the
            # whole message whenever there is no run id.
            start_event_text: str = f"Starting Pipeline {pipeline_name}"
            if self._run_id:
                start_event_text += f"_{self._run_id}"
            self.progress_logger.log_event(
                event_name=pipeline_name,
                event_text=start_event_text,
                log_level=LogLevel.INFO,
            )

            for i, transformer in enumerate(self.transformers, start=1):
                assert isinstance(transformer, Transformer), type(transformer)
                if hasattr(transformer, "getName"):
                    # noinspection Mypy
                    stage_name = (
                        f"{transformer.getName()} ({transformer.__class__.__name__})"
                    )
                else:
                    stage_name = transformer.__class__.__name__

                transformer_span: TelemetrySpanWrapper
                async with telemetry_span_creator.create_telemetry_span_async(
                    name=stage_name,
                    attributes={
                        "loop_id": self.loop_id,
                    },
                    telemetry_parent=telemetry_span.create_child_telemetry_parent(),
                ) as transformer_span:
                    try:
                        logger.info(
                            f"---- Running pipeline [{pipeline_name}] transformer [{stage_name}] "
                            f"({i} of {count_of_transformers}) ----"
                        )
                        # hand our loop id and a child telemetry parent down to
                        # transformers that know how to accept them
                        if isinstance(transformer, LoopIdMixin):
                            transformer.set_loop_id(self.loop_id)
                        if isinstance(transformer, TelemetryParentMixin):
                            transformer.set_telemetry_parent(
                                telemetry_parent=transformer_span.create_child_telemetry_parent()
                            )

                        with ProgressLogMetric(
                            progress_logger=self.progress_logger,
                            name=str(stage_name) or "unknown",
                        ):
                            self.progress_logger.log_event(
                                pipeline_name,
                                event_text=f"Running pipeline step {stage_name}",
                            )
                            if hasattr(transformer, "_transform_async"):
                                # noinspection PyProtectedMember
                                df = await transformer._transform_async(df=df)
                            else:
                                df = transformer.transform(dataset=df)
                    except Exception as e:
                        logger.error(
                            f"!!!!!!!!!!!!! pipeline [{pipeline_name}] transformer [{stage_name}] threw exception !!!!!!!!!!!!!"
                        )
                        if hasattr(transformer, "getSql"):
                            # noinspection Mypy
                            logger.error(transformer.getSql())
                        logger.error(
                            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
                        )
                        self.progress_logger.log_exception(
                            event_name=pipeline_name,
                            event_text=f"Exception in Stage={stage_name}",
                            ex=e,
                        )
                        # add the stage name to the exception but keep the
                        # original exception (type, args and traceback)
                        if len(e.args) >= 1:
                            e.args = (f"In Stage ({stage_name})", *e.args)
                        raise

            self.progress_logger.log_event(
                event_name=pipeline_name,
                event_text=f"Finished Pipeline {pipeline_name}",
                log_level=LogLevel.INFO,
            )
            await telemetry_span_creator.flush_async()
            return df

    # noinspection PyMethodMayBeStatic
    def create_steps(
        self,
        my_list: Union[
            List[Transformer],
            List[FrameworkTransformer],
            List[Union[Transformer, List[Transformer]]],
            List[Union[FrameworkTransformer, List[FrameworkTransformer]]],
        ],
    ) -> List[Transformer]:
        """
        Converts a list of steps into a list of transformers by delegating to
        ``pipeline_helper.create_steps``.

        :param my_list: transformers, optionally grouped in nested lists
        :return: list of transformers
        """
        return create_steps(my_list)

    def finalize(self) -> None:
        """Does nothing in this base class."""
        pass

    def as_dict(self) -> Dict[str, Any]:
        """
        Describes this pipeline as a plain dictionary.

        :return: dict with the class name (``short_type``), the full class name
                 (``type``), the parameters (``params``) and the steps (``steps``).
                 Parameter values exposing ``as_dict`` are expanded with it; every
                 step is expected to expose ``as_dict``.
        """
        return {
            "short_type": self.__class__.__name__,
            "type": ClassHelpers.get_full_name_of_instance(self),
            # self.parameters is a subclass of dict so json.dumps thinks it can't serialize it
            "params": {
                k: v if not hasattr(v, "as_dict") else v.as_dict()
                for k, v in self.parameters.items()
            },
            "steps": [
                s.as_dict() if not isinstance(s, list) else [s1.as_dict() for s1 in s]  # type: ignore
                for s in self.steps
            ],
        }

    def __str__(self) -> str:
        """JSON rendering of ``as_dict``; non-serializable values fall back to ``str``."""
        return json.dumps(self.as_dict(), default=str)

    async def transform_async(self, df: DataFrame) -> DataFrame:
        """
        Public async entry point; awaits ``_transform_async``.

        :param df: input dataframe
        :return: transformed dataframe
        """
        return await self._transform_async(df=df)