Skip to content

Commit 9a46519

Browse files
committed
py: Add base_model_sources and dataset_sources to metadata heuristics
This is to address "Model Card: Allow for dicts in datasets and base_model and also update spec" in huggingface/huggingface_hub#2479, where we would like to add detailed metadata support for both base model and dataset, but in a way that Hugging Face will eventually be able to support. (They currently use either a string or a list of strings; we will be using a list of dicts, which would be extensible.) They recommended creating a separate metadata property for this.
1 parent 6400391 commit 9a46519

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

gguf-py/gguf/metadata.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -348,12 +348,12 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
348348
use_model_card_metadata("author", "model_creator")
349349
use_model_card_metadata("basename", "model_type")
350350

351-
if "base_model" in model_card or "base_models" in model_card:
351+
if "base_model" in model_card or "base_models" in model_card or "base_model_sources" in model_card:
352352
# This represents the parent models that this is based on
353353
# Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
354354
# Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
355355
metadata_base_models = []
356-
base_model_value = model_card.get("base_model", model_card.get("base_models", None))
356+
base_model_value = model_card.get("base_model", model_card.get("base_models", model_card.get("base_model_sources", None)))
357357

358358
if base_model_value is not None:
359359
if isinstance(base_model_value, str):
@@ -402,14 +402,16 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
402402

403403
elif isinstance(model_id, dict):
404404
base_model = model_id
405+
405406
else:
406407
logger.error(f"base model entry '{str(model_id)}' not in a known format")
408+
407409
metadata.base_models.append(base_model)
408410

409-
if "datasets" in model_card or "dataset" in model_card:
411+
if "datasets" in model_card or "dataset" in model_card or "dataset_sources" in model_card:
410412
# This represents the datasets that this was trained from
411413
metadata_datasets = []
412-
dataset_value = model_card.get("datasets", model_card.get("dataset", None))
414+
dataset_value = model_card.get("datasets", model_card.get("dataset", model_card.get("dataset_sources", None)))
413415

414416
if dataset_value is not None:
415417
if isinstance(dataset_value, str):
@@ -458,8 +460,10 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
458460

459461
elif isinstance(dataset_id, dict):
460462
dataset = dataset_id
463+
461464
else:
462465
logger.error(f"dataset entry '{str(dataset_id)}' not in a known format")
466+
463467
metadata.datasets.append(dataset)
464468

465469
use_model_card_metadata("license", "license")

0 commit comments

Comments
 (0)