huggingface · merveenoyan · Mar 24, 2025 · Mar 24, 2025 · Mar 24, 2025 · Mar 24, 2025
diff --git a/src/smolagents/models.py b/src/smolagents/models.py
@@ -656,6 +656,7 @@ def __init__(
             raise ModuleNotFoundError(
                 "Please install 'transformers' extra to use 'TransformersModel': `pip install 'smolagents[transformers]'`"
             )
+        import huggingface_hub
         import torch
         from transformers import AutoModelForCausalLM, AutoModelForImageTextToText, AutoProcessor, AutoTokenizer
 
@@ -680,27 +681,42 @@ def __init__(
         if device_map is None:
             device_map = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"Using device: {device_map}")
-        self._is_vlm = False
-        try:
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-                device_map=device_map,
-                torch_dtype=torch_dtype,
-                trust_remote_code=trust_remote_code,
+
+        if os.path.exists(model_id):
+            readme_path = f"{model_id}/README.md"
+            with open(readme_path, "r") as f:
+                for line in f:
+                    if "pipeline_tag" in line:
+                        pipeline_tag = line.split(":")[1].strip()
+        else:
+            api = huggingface_hub.HfApi()
+            pipeline_tag = api.model_info(model_id).pipeline_tag
+
+        if pipeline_tag == "image-text-to-text":
+            self._is_vlm = True
+        elif pipeline_tag == "text-generation":
+            self._is_vlm = False
+        else:
+            raise ValueError(
+                f"Unsupported task, model has to be either a vision language model or a language model: {pipeline_tag}"
             )
-            self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
-        except ValueError as e:
-            if "Unrecognized configuration class" in str(e):
+        try:
+            if not self._is_vlm:
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    model_id,
+                    device_map=device_map,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=trust_remote_code,
+                )
+                self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+            else:
                 self.model = AutoModelForImageTextToText.from_pretrained(
                     model_id,
                     device_map=device_map,
                     torch_dtype=torch_dtype,
                     trust_remote_code=trust_remote_code,
                 )
                 self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
-                self._is_vlm = True
-            else:
-                raise e
         except Exception as e:
             raise ValueError(f"Failed to load tokenizer and model for {model_id=}: {e}") from e
         super().__init__(flatten_messages_as_text=not self._is_vlm, **kwargs)