
Commit 9833412
Whisper support via FasterWhisper
1 parent 54d1d71

File tree: 9 files changed, +279 -63 lines changed


modules/faster_whisper/__init__.py (new file, +17)

from fastapi import FastAPI

from modules.faster_whisper.load import faster_whisper_load
from modules.faster_whisper.unload import faster_whisper_unload
from modules.faster_whisper.action import faster_whisper_action


def setup_faster_whisper(app: FastAPI) -> None:
    """
    Setup FasterWhisper routes.
    """

    app.post("/faster_whisper/load/")(faster_whisper_load)

    app.post("/faster_whisper/unload")(faster_whisper_unload)

    app.post("/faster_whisper/action")(faster_whisper_action)
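Note the registration style: setup_faster_whisper calls app.post(...) and immediately applies the returned decorator to an already-imported handler. Not part of this commit, but a minimal sketch showing that this is equivalent to the usual decorator form (the route and handlers below are hypothetical):

from fastapi import FastAPI

app = FastAPI()


# Decorator form:
@app.post("/example/ping")
async def ping_example():  # hypothetical route and handler
    return {"status": "ok"}


# Equivalent after-the-fact registration, which is what setup_faster_whisper does:
async def ping_example_2():  # hypothetical handler
    return {"status": "ok"}


app.post("/example/ping2")(ping_example_2)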

modules/faster_whisper/action.py (new file, +27)

from modules.state import get_inference
from modules.faster_whisper.inference import FasterWhisperInference, inference_name

from pydantic import BaseModel


class FasterWhisperInferenceData(BaseModel):
    """
    Task schema for FasterWhisper actions.

    Attributes:
    - audio: str: The audio to transcribe (base64 encoded)
    """

    audio: str


async def faster_whisper_action(data: FasterWhisperInferenceData):
    """
    Use FasterWhisper to transcribe audio.
    """

    inference: FasterWhisperInference = get_inference(inference_name)

    result = inference.inference(data.audio)

    return result
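The audio field is expected to hold a base64-encoded audio file. Not part of this commit, but a rough in-process sketch of exercising this handler, assuming a model has already been loaded via the load route and that sample.wav exists locally:

import asyncio
import base64

from modules.faster_whisper.action import FasterWhisperInferenceData, faster_whisper_action

# Encode a local audio file into the base64 string the schema expects.
with open("sample.wav", "rb") as f:
    payload = FasterWhisperInferenceData(audio=base64.b64encode(f.read()).decode("utf-8"))

# The handler is an async function, so it can be driven directly for a quick test.
text = asyncio.run(faster_whisper_action(payload))
print(text)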

modules/faster_whisper/inference.py (new file, +49)

import base64
from io import BytesIO

import torch
from faster_whisper import WhisperModel


class FasterWhisperInference:
    def __init__(
        self,
        model: str,
        device: str,
    ):
        if device == "cuda":
            if not torch.cuda.is_available():
                raise ValueError("CUDA is not available on this device.")
            else:
                self.device = "cuda"
        else:
            self.device = "cpu"

        self.compute_type = "float16" if self.device == "cuda" else "float32"

        self.model = WhisperModel(
            model,
            device=self.device,
            compute_type=self.compute_type,
        )

    def __del__(self):
        del self.model
        try:
            torch.cuda.empty_cache()
        except:
            pass

    def inference(
        self,
        audioRaw: str,
    ) -> any:
        fileBytes = base64.b64decode(audioRaw)

        segments, info = self.model.transcribe(BytesIO(fileBytes), beam_size=5)
        segments_list = list(segments)  # The transcription will actually run here.

        return " ".join([segment.text for segment in segments_list]).strip()


inference_name = FasterWhisperInference.__name__
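The inline comment on list(segments) is the key detail: WhisperModel.transcribe returns a lazy generator, and the decoding only runs when the segments are consumed. Not part of this commit, but a standalone sketch of the same pattern using faster_whisper directly (the model size, device, and file name are placeholders):

from io import BytesIO

from faster_whisper import WhisperModel

# Placeholder values; the server lets the client pick the model and device.
model = WhisperModel("base", device="cpu", compute_type="float32")

with open("sample.wav", "rb") as f:
    segments, info = model.transcribe(BytesIO(f.read()), beam_size=5)

# Consuming the generator is what actually runs the transcription.
text = " ".join(segment.text for segment in segments).strip()
print(info.language, text)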

modules/faster_whisper/load.py (new file, +30)

from modules.state import load_inference
from modules.faster_whisper.inference import FasterWhisperInference, inference_name

from pydantic import BaseModel


class FasterWhisperData(BaseModel):
    """
    Task schema for loading FasterWhisper inference.

    Attributes:
    - model: str: The model size to download from HuggingFace hub.
    - device: str: The device to load the model to.
    """

    model: str
    device: str
    force_reload: bool = True


async def faster_whisper_load(data: FasterWhisperData):
    """
    Load a FasterWhisper model to RAM/VRAM.
    """

    load_inference(
        inference_name,
        FasterWhisperInference(model=data.model, device=data.device),
        force_reload=data.force_reload,
    )
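Not part of this commit, but a hedged sketch of what a client call to the load route might look like (the requests package, host, and port are assumptions; the model and device values are examples):

import requests  # assumed to be installed

response = requests.post(
    "http://localhost:8000/faster_whisper/load/",  # host and port are assumptions
    json={"model": "base", "device": "cuda", "force_reload": False},
)
response.raise_for_status()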

modules/faster_whisper/unload.py (new file, +10)

from modules.state import unload_inference
from modules.faster_whisper.inference import inference_name


async def faster_whisper_unload():
    """
    Unload the FasterWhisper model from RAM/VRAM.
    """

    unload_inference(inference_name)

modules/state/__init__.py (+10, -4)

@@ -14,14 +14,20 @@ def unload_inference(inference_name: str) -> None:
     gc.collect()
 
 
-def load_inference(inference_name: str, inference: any) -> None:
+def load_inference(inference_name: str, inference: any, force_reload=True) -> None:
     """
     Load inference module into state.
     """
 
-    if inference_name in state:
-        unload_inference(inference_name)
-    state[inference_name] = inference
+    if force_reload:
+        if inference_name in state:
+            unload_inference(inference_name)
+        state[inference_name] = inference
+    else:
+        if inference_name in state:
+            return
+        else:
+            state[inference_name] = inference
 
 
 def get_inference(inference_name: str) -> any:
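With this change, force_reload=True (the default) keeps the old behaviour of unloading any existing instance and replacing it, while force_reload=False leaves an already-registered instance untouched. Not part of this commit, but a small usage sketch, assuming get_inference simply returns whatever was stored under the given name (the DummyInference class is hypothetical):

from modules.state import get_inference, load_inference


class DummyInference:  # hypothetical stand-in for a real inference wrapper
    pass


first = DummyInference()
second = DummyInference()

load_inference("DummyInference", first)                       # registers `first`
load_inference("DummyInference", second, force_reload=False)  # keeps `first`
assert get_inference("DummyInference") is first

load_inference("DummyInference", second, force_reload=True)   # replaces with `second`
assert get_inference("DummyInference") is second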

server.py (+12)

@@ -3,8 +3,19 @@
 sys.dont_write_bytecode = True
 
 
+def setup_cuda_env():
+    import os
+
+    os.environ["LD_LIBRARY_PATH"] = os.path.join(
+        os.getcwd(), "miniconda", "envs", "oc_external", "lib"
+    )
+
+
 if __name__ == "__main__":
+    setup_cuda_env()
+
     from modules.florence2 import setup_florence2
+    from modules.faster_whisper import setup_faster_whisper
 
     from custom_modules import get_custom_modules
 

@@ -37,6 +48,7 @@ async def ping():
 
     # Setup internal module routes.
     setup_florence2(app)
+    setup_faster_whisper(app)
 
     # Setup custom module routes.
     for custom_module_setup in get_custom_modules():
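The new module plugs into server.py through the same setup_* convention already used for florence2 and for custom modules. Not part of this commit, but a hypothetical custom module following that convention (the module name and route are made up):

from fastapi import FastAPI


async def my_module_action():
    # Hypothetical handler; a real module would run its own inference here.
    return {"status": "ok"}


def setup_my_module(app: FastAPI) -> None:
    """
    Setup routes for a hypothetical custom module.
    """

    app.post("/my_module/action")(my_module_action)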
