Commit c858b1f

[Pipeline Refactor][Text Generation][Continuous Batching] Integration (#1409)
* update split/join
* use map
* update
* run end-to-end
* clean-up
* fix bug with batch size, introduce SplitRoute dataclass
* update tests to use new inputs/outputs
* use the normal scheduler for internal kv_cache
* add pipeline inputs
* clean-up
* change engine type, update docstrings, update override function to be more generic
* move subgraph functionality to its own function; clean-up cont batching in text gen pipeline
* update linear pathway to also use subgraph execution
* rebase fix
* fix tests
1 parent 1b9238a commit c858b1f

20 files changed (+486 -171)

src/deepsparse/v2/operators/engine_operator.py (+7 -5)

@@ -20,7 +20,7 @@
 from deepsparse import Context as EngineContext
 from deepsparse import Engine, MultiModelEngine, Scheduler
 from deepsparse.benchmark import ORTEngine
-from deepsparse.utils import model_to_path
+from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs
 from deepsparse.v2.operators import Operator


@@ -29,12 +29,12 @@

 SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE]

-__all__ = ["EngineOperator"]
+__all__ = ["EngineOperator", "EngineOperatorInputs", "EngineOperatorOutputs"]


 class EngineOperatorInputs(BaseModel):
     engine_inputs: List = Field(description="engine_inputs")
-    engine: Optional[Engine] = Field(
+    engine: Optional[Union[ORTEngine, Engine]] = Field(
         description="override the engine to run forward pass with",
         default=None,
     )
@@ -95,8 +95,8 @@ def __init__(
         engine_kwargs: Dict = None,
     ):
         self.model_path = model_to_path(model_path)
-        self._batch_size = 1
         self.engine_context = engine_context
+        self._batch_size = 1

         if self.engine_context is not None:
             num_cores = num_cores or self.engine_context.num_cores
@@ -131,6 +131,7 @@ def batch_size(self) -> int:
         """
         return self._batch_size

+    # TODO: maybe add a few args to make this less opaque?
     def create_engine(
         self,
         **kwargs,
@@ -142,7 +143,8 @@ def create_engine(
             constructor/compilation
         :return: inference engine
         """
-        onnx_file_path = self.model_path
+
+        onnx_file_path = kwargs.pop("model_path", self.model_path)
         engine_args = deepcopy(self._engine_args)
         engine_args.update(kwargs)
         engine_type = self._engine_type.lower()
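The widened engine field and the new model_path kwarg make per-call engine overrides possible, which the continuous batching integration below relies on. A minimal sketch of how a caller might use them; the ONNX paths are placeholders and not taken from this diff:

from deepsparse.v2.operators.engine_operator import (
    EngineOperator,
    EngineOperatorInputs,
)

# compiles the default engine at batch size 1, per __init__ above
op = EngineOperator(model_path="model.onnx")  # placeholder path

# create_engine() now accepts a "model_path" override (popped from **kwargs),
# e.g. to compile a second engine from a variant of the same model
override_engine = op.create_engine(model_path="model_kv_cache.onnx")

# the override engine (Engine or ORTEngine, per the widened Union) travels
# with the inputs instead of replacing the operator's default engine;
# engine_inputs would be the list of numpy arrays for the forward pass
inputs = EngineOperatorInputs(engine_inputs=[], engine=override_engine)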

src/deepsparse/v2/operators/operator.py (+1 -4)

@@ -17,7 +17,7 @@

 from pydantic import BaseModel

-from deepsparse.v2.utils import InferenceState, PipelineState
+from deepsparse.v2.utils import InferenceState


 __all__ = ["Operator"]
@@ -57,7 +57,6 @@ def __call__(
         self,
         *args,
         inference_state: InferenceState,
-        pipeline_state: PipelineState,
         **kwargs,
     ) -> Any:
         """
@@ -90,13 +89,11 @@ def __call__(
             run_output = self.run(
                 inference_input,
                 inference_state=inference_state,
-                pipeline_state=pipeline_state,
             )
         else:
             run_output = self.run(
                 *args,
                 inference_state=inference_state,
-                pipeline_state=pipeline_state,
                 **kwargs,
             )
         if self.has_output_schema():
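With pipeline_state dropped from Operator.__call__, an operator now receives only its inputs and the per-request inference_state; pipeline-level state stays on the Pipeline object. A self-contained sketch of the same dispatch shape, where EchoOperator and EchoInput are illustrative stand-ins rather than deepsparse classes:

from pydantic import BaseModel


class EchoInput(BaseModel):
    text: str


class EchoOperator:
    # mirrors Operator: kwargs are validated against input_schema, then
    # run() receives only the validated input and the inference state
    input_schema = EchoInput

    def run(self, inference_input, inference_state):
        return {"text": inference_input.text.upper()}

    def __call__(self, *args, inference_state, **kwargs):
        inference_input = self.input_schema(**kwargs)
        return self.run(inference_input, inference_state=inference_state)


print(EchoOperator()(text="hi", inference_state=None))  # {'text': 'HI'}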

src/deepsparse/v2/pipeline.py (+138 -86)

@@ -15,13 +15,18 @@

 import copy
 from concurrent.futures import Future
-from functools import partial
-from typing import Any, Callable, Dict, List, Union
+from typing import Any, Dict, List, Union

-from deepsparse.v2.operators import Operator
+from deepsparse.v2.operators import EngineOperator, Operator
 from deepsparse.v2.routers import Router
-from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup
+from deepsparse.v2.schedulers import (
+    ContinuousBatchingScheduler,
+    OperatorScheduler,
+    SchedulerGroup,
+)
 from deepsparse.v2.utils import InferenceState, PipelineState
+from deepsparse.v2.utils.data import SubGraph
+from deepsparse.v2.utils.helpers import run_func


 __all__ = ["Pipeline"]
@@ -50,39 +55,100 @@ def __init__(
         ops: Union[Dict[str, Operator], List[Operator]],
         router: Router,
         schedulers: List[OperatorScheduler],
+        continuous_batching_scheduler: ContinuousBatchingScheduler,
         pipeline_state: PipelineState = None,
     ):

         self.ops = ops
         self.router = router
         self.schedulers = schedulers
         self.pipeline_state = pipeline_state
+        self._continuous_batching_scheduler = continuous_batching_scheduler
         self.validate()

         self._scheduler_group = SchedulerGroup(self.schedulers)

-    def _run_sequential(
+    def _run_next(
         self,
         inp: Any,
         inference_state: InferenceState,
-        pipeline_state: PipelineState,
-        start: str,
-        end: str,
+        next_step: str,
     ):
-        next_step = start
-        while next_step != end:
-            outputs = self._run_next_step(
-                func=self.ops[next_step],
-                next_step=next_step,
-                input=inp,
-                pipeline_state=pipeline_state,
-                inference_state=inference_state,
-            )
-            next_step, operator_output, state_update = outputs
-            if state_update:
-                inference_state.update_state(state_update)
-            inp = operator_output
-        return inp
+        if (
+            isinstance(self.ops[next_step], EngineOperator)
+            and self._continuous_batching_scheduler
+        ):
+            func = self._continuous_batching_scheduler.submit
+            inp = self.ops[next_step].input_schema(**inp)
+        else:
+            func = self._scheduler_group.submit
+
+        return run_func(
+            func=func,
+            operator=self.ops[next_step],
+            inp=inp,
+            pipeline_state=self.pipeline_state,
+            inference_state=inference_state,
+        )
+
+    def _run_sub_graphs(
+        self, sub_graph_inputs: List[Any], sub_graphs: List[SubGraph]
+    ) -> List[Any]:
+        """
+        Run a list of sub_graphs asynchronously. Polls to identify any sub graph
+        that is still running but has completed its current step, and schedules
+        its next step. This is repeated until all subgraphs have finished running
+        and have reached their end step (stored in the SubGraph.end attribute).
+
+        :param sub_graph_inputs: A list of inputs that should be passed to each
+            subgraph. Each subgraph is given an element of the list as input to
+            its first node.
+        :param sub_graphs: A list of SubGraph objects. Each stores the relevant
+            execution information for the particular subgraph, such as its
+            current step in the sub graph, inference state, output, and end step.
+
+        :returns: a list of outputs for all the completed SubGraph objects.
+            Returned in the same order that the subgraphs were passed to the
+            function.
+        """
+        for i in range(len(sub_graphs)):
+            sub_graphs[i].output = self._run_next(
+                sub_graph_inputs[i], sub_graphs[i].inf, sub_graphs[i].step
+            )
+
+        # Execute all sub graphs until all graphs have been completed.
+        while True:
+            for sub_graph in sub_graphs:
+                if isinstance(sub_graph.output, Future) and sub_graph.output.done():
+                    # get the result for the completed operator; resolve its output
+                    operator_output = sub_graph.output.result()
+                    operator_output = sub_graph.parse_output(operator_output)
+
+                    # determine the next step for the particular operator, using
+                    # its previous output and previously stored step
+                    next_step = self.router.next(
+                        sub_graph.step, self.ops, operator_output
+                    )
+                    # update the step
+                    sub_graph.step = next_step
+
+                    # store the output for the next step. If the next step is an
+                    # end step, this particular route has completed; simply
+                    # update the output value
+                    if next_step in sub_graph.end:
+                        sub_graph.output = operator_output
+                    else:
+                        sub_graph.output = self._run_next(
+                            inp=operator_output,
+                            inference_state=sub_graph.inf,
+                            next_step=next_step,
+                        )
+                    break
+
+            # keep running until all sub graphs have completed
+            if not any(isinstance(x.output, Future) for x in sub_graphs):
+                break
+
+        return [x.output for x in sub_graphs]
@@ -93,59 +159,29 @@ def _apply_split(self, inp: Any, inference_state: InferenceState):
         """

         batches, orig_batch_size = self.expand_inputs(inp, 1)
-        run_with_state = partial(
-            self._run_sequential,
-            pipeline_state=self.pipeline_state,
-            start=self.router.route[self.router.SPLIT_ROUTE],
-            end=self.router.JOIN_ROUTE,
-        )
-        inference_state_list = [
-            copy.deepcopy(inference_state) for x in range(len(batches))
-        ]
-        futures = self._scheduler_group.map(
-            batches,
-            inference_state_list,
-            func=run_with_state,
-        )
-        return self.condense_inputs([x.result() for x in futures])
-
-    def _run_next_step(
-        self,
-        *args,
-        func: Callable,
-        next_step: Union[str, int],
-        input: Any = None,
-        **kwargs,
-    ):
-        """
-        Generic function to run a given func, process the output and determine the
-        next step.
-        """
-        if input:
-            operator_output = (
-                func(*args, **kwargs, **input)
-                if isinstance(input, dict)
-                else func(input, *args, **kwargs)
-            )
-        else:
-            operator_output = func(*args, **kwargs)
-
-        if isinstance(operator_output, Future):
-            operator_output = operator_output.result()
-
-        state_update = None
-        if isinstance(operator_output, tuple):
-            state_update = operator_output[-1]
-            operator_output = operator_output[0]
-
-        next_step = self.router.next(next_step, self.ops, operator_output)
-        return next_step, operator_output, state_update
+        # Create a list of SubGraphs, one per batch of size 1.
+        # Each SubGraph object holds information about the particular path it
+        # follows. All start at the same step defined by SPLIT_ROUTE and start
+        # with the same inference_state.
+        split_graphs = [
+            SubGraph(
+                inf=copy.deepcopy(inference_state),
+                step=self.router.route[self.router.SPLIT_ROUTE],
+                end=[self.router.JOIN_ROUTE],
+            )
+            for i in range(len(batches))
+        ]
+
+        outputs = self._run_sub_graphs(
+            sub_graph_inputs=batches, sub_graphs=split_graphs
+        )
+        return self.condense_inputs(outputs)

     def run(
         self,
         *args,
         inference_state: InferenceState,
-        pipeline_state: PipelineState,
         **kwargs,
     ):
         """
@@ -158,36 +194,56 @@
         """
         next_step = self.router.START_ROUTE
         operator_output = None
-
         while next_step != self.router.END_ROUTE:
+
+            # Split graph execution (i.e. multiple subgraphs)
             # NOTE: split_route should only appear after the start route node
             if next_step == self.router.SPLIT_ROUTE:
+                if operator_output is None:
+                    raise ValueError(
+                        f"{self.router.SPLIT_ROUTE} should appear after "
+                        f"{self.router.START_ROUTE}"
+                    )
+
                 operator_output = self._apply_split(operator_output, inference_state)
                 next_step = self.router.route[self.router.JOIN_ROUTE]
+                if next_step == self.router.END_ROUTE:
+                    return operator_output

             if next_step == self.router.START_ROUTE:
-                outputs = self._run_next_step(
+                operator_output = run_func(
                     *args,
-                    next_step=next_step,
                     func=self._scheduler_group.submit,
-                    inference_state=inference_state,
                     operator=self.ops[next_step],
-                    pipeline_state=pipeline_state,
+                    inference_state=inference_state,
+                    pipeline_state=self.pipeline_state,
                     **kwargs,
-                )
+                ).result()
+
+                if isinstance(operator_output, tuple):
+                    operator_output, state_update = (
+                        operator_output[0],
+                        operator_output[-1],
+                    )
+                    inference_state.update_state(state_update)
+
+                next_step = self.router.next(next_step, self.ops, operator_output)
+
             else:
-                outputs = self._run_next_step(
-                    func=self._scheduler_group.submit,
-                    input=operator_output,
-                    next_step=next_step,
-                    inference_state=inference_state,
-                    operator=self.ops[next_step],
-                    pipeline_state=pipeline_state,
+                # Single graph execution
+                graph = SubGraph(
+                    inf=copy.deepcopy(inference_state),
+                    step=next_step,
+                    end=[self.router.SPLIT_ROUTE, self.router.END_ROUTE],
                 )

-            next_step, operator_output, state_update = outputs
-            if state_update:
-                inference_state.update_state(state_update)
+                operator_output = self._run_sub_graphs(
+                    sub_graph_inputs=[operator_output], sub_graphs=[graph]
+                )[0]
+
+                inference_state = graph.inf
+                next_step = graph.step
+
         return operator_output
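The start-route branch above shows the state-update contract: an operator may return an (output, state_update) tuple, and the run loop folds the dict into the shared InferenceState. A minimal runnable illustration of that contract; FakeInferenceState and tokenize are stand-ins, not the deepsparse implementation:

class FakeInferenceState:
    """Stand-in exposing the one method the run loop relies on."""

    def __init__(self):
        self.state = {}

    def update_state(self, update: dict):
        self.state.update(update)


def tokenize(text):
    tokens = text.split()
    # the second element is the state update the pipeline folds in
    return tokens, {"num_tokens": len(tokens)}


state = FakeInferenceState()
operator_output = tokenize("a b c")
if isinstance(operator_output, tuple):
    operator_output, state_update = operator_output[0], operator_output[-1]
    state.update_state(state_update)
print(operator_output, state.state)  # ['a', 'b', 'c'] {'num_tokens': 3}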
@@ -204,11 +260,7 @@ def __call__(self, *args, **kwargs):
         inference_state = InferenceState()
         inference_state.create_state({})

-        if "pipeline_state" in kwargs:
-            self.pipeline_state = kwargs.get("pipeline_state")
-
         kwargs["inference_state"] = inference_state
-        kwargs["pipeline_state"] = self.pipeline_state

         return self.run(*args, **kwargs)

src/deepsparse/v2/routers/router.py (-2)

@@ -83,8 +83,6 @@ class LinearRouter(Router):

     def __init__(self, end_route: int, start_route: int = 0):
         super().__init__(end_route=end_route, start_route=start_route)
-        self.SPLIT_ROUTE = None
-        self.JOIN_ROUTE = None
         _LOGGER.warn("SPLIT and JOIN are not yet supported for the LinearRouter.")

     def next(
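A LinearRouter simply advances an integer step from start_route toward end_route, so the SPLIT/JOIN sentinels inherited from Router are now left at their base-class defaults rather than overwritten with None. A hedged usage sketch; the import path follows the file above, and end_route counting operators is an assumption:

from deepsparse.v2.routers.router import LinearRouter

# visits ops[0], ops[1], ops[2] in order; SPLIT and JOIN are not yet
# supported for the LinearRouter (per the warning above)
router = LinearRouter(end_route=3)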
