13
13
14
14
#include < yt/cpp/mapreduce/interface/logging/logger.h>
15
15
16
+ #include < util/string/split.h>
17
+
16
18
using namespace NActors ;
17
19
20
+ TVector<std::pair<TString, TString>> GetJobFiles (TVector<TString> udfs) {
21
+ TVector<std::pair<TString, TString>> result;
22
+
23
+ for (const TString& udf: udfs) {
24
+ TVector<TString> splitResult;
25
+ Split (udf.data (), " /" , splitResult);
26
+ while (!splitResult.empty () && splitResult.back ().empty ()) {
27
+ splitResult.pop_back ();
28
+ }
29
+
30
+ Y_ENSURE (!splitResult.empty ());
31
+
32
+ result.push_back (std::make_pair (udf, splitResult.back ()));
33
+ }
34
+
35
+ return result;
36
+ }
37
+
18
38
class TQueryReplayMapper
19
39
: public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>>
20
40
{
@@ -25,7 +45,8 @@ class TQueryReplayMapper
25
45
TIntrusivePtr<NKikimr::NMiniKQL::IMutableFunctionRegistry> FunctionRegistry;
26
46
TIntrusivePtr<NKikimr::NKqp::TModuleResolverState> ModuleResolverState;
27
47
28
- TQueryReplayConfig Config;
48
+ TVector<TString> UdfFiles;
49
+ ui32 ActorSystemThreadsCount = 5 ;
29
50
30
51
TString GetFailReason (const TQueryReplayEvents::TCheckQueryPlanStatus& status) {
31
52
switch (status) {
@@ -58,16 +79,30 @@ class TQueryReplayMapper
58
79
59
80
public:
60
81
TQueryReplayMapper () = default ;
61
- TQueryReplayMapper (const TQueryReplayConfig& config) : Config(config) {
62
- }
82
+
83
+ Y_SAVELOAD_JOB (UdfFiles, ActorSystemThreadsCount);
84
+
85
+ TQueryReplayMapper (TVector<TString> udfFiles, ui32 actorSystemThreadsCount)
86
+ : UdfFiles(udfFiles)
87
+ , ActorSystemThreadsCount(actorSystemThreadsCount)
88
+ {}
63
89
64
90
void Start (NYT::TTableWriter<NYT::TNode>*) override {
65
91
TypeRegistry.Reset (new NKikimr::NScheme::TKikimrTypeRegistry ());
66
92
FunctionRegistry.Reset (NKikimr::NMiniKQL::CreateFunctionRegistry (NKikimr::NMiniKQL::CreateBuiltinRegistry ())->Clone ());
67
93
NKikimr::NMiniKQL::FillStaticModules (*FunctionRegistry);
94
+ NKikimr::NMiniKQL::TUdfModuleRemappings remappings;
95
+ THashSet<TString> usedUdfPaths;
96
+
97
+ for (const auto & [_, udfPath]: GetJobFiles (UdfFiles)) {
98
+ if (usedUdfPaths.insert (udfPath).second ) {
99
+ FunctionRegistry->LoadUdfs (udfPath, remappings, 0 );
100
+ }
101
+ }
102
+
68
103
AppData.Reset (new NKikimr::TAppData (0 , 0 , 0 , 0 , {}, TypeRegistry.Get (), FunctionRegistry.Get (), nullptr , nullptr ));
69
104
AppData->Counters = MakeIntrusive<NMonitoring::TDynamicCounters>(new NMonitoring::TDynamicCounters ());
70
- auto setup = BuildActorSystemSetup (Config. ActorSystemThreadsCount );
105
+ auto setup = BuildActorSystemSetup (ActorSystemThreadsCount);
71
106
ActorSystem.Reset (new TActorSystem (setup, AppData.Get ()));
72
107
ActorSystem->Start ();
73
108
ActorSystem->Register (NKikimr::NKqp::CreateKqpResourceManagerActor ({}, nullptr ));
@@ -164,9 +199,17 @@ int main(int argc, const char** argv) {
164
199
NYT::TMapOperationSpec spec;
165
200
spec.AddInput <NYT::TNode>(config.SrcPath );
166
201
spec.AddOutput <NYT::TNode>(NYT::TRichYPath (config.DstPath ).Schema (OutputSchema ()));
167
- spec.MapperSpec (NYT::TUserJobSpec ().MemoryLimit (5_GB));
168
202
169
- client->Map (spec, new TQueryReplayMapper (config));
203
+ auto userJobSpec = NYT::TUserJobSpec ();
204
+ userJobSpec.MemoryLimit (1_GB);
205
+
206
+ for (const auto & [udf, udfInJob]: GetJobFiles (config.UdfFiles )) {
207
+ userJobSpec.AddLocalFile (udf, NYT::TAddLocalFileOptions ().PathInJob (udfInJob));
208
+ }
209
+
210
+ spec.MapperSpec (userJobSpec);
211
+
212
+ client->Map (spec, new TQueryReplayMapper (config.UdfFiles , config.ActorSystemThreadsCount ));
170
213
171
214
return EXIT_SUCCESS;
172
215
}
0 commit comments