diff --git a/docs/api-inference/_redirects.yml b/docs/api-inference/_redirects.yml index 74f738f5f4..f0b5d38c5d 100644 --- a/docs/api-inference/_redirects.yml +++ b/docs/api-inference/_redirects.yml @@ -1,6 +1,6 @@ quicktour: index -detailed_parameters: parameters -parallelism: getting_started -usage: getting_started +detailed_parameters: tasks/index +parallelism: index +usage: index faq: index rate-limits: pricing diff --git a/docs/api-inference/_toctree.yml b/docs/api-inference/_toctree.yml index 91a9367484..7a11d8bec3 100644 --- a/docs/api-inference/_toctree.yml +++ b/docs/api-inference/_toctree.yml @@ -1,27 +1,33 @@ -- sections: +- title: Get Started + sections: - local: index - title: Serverless Inference API - - local: getting-started - title: Getting Started - - local: supported-models - title: Supported Models + title: Inference Providers - local: pricing - title: Pricing and Rate limits + title: Pricing and Billing + - local: hub-integration + title: Hub integration - local: security title: Security - title: Getting Started -- sections: - - local: parameters - title: Parameters - - sections: - - local: tasks/audio-classification - title: Audio Classification - - local: tasks/automatic-speech-recognition - title: Automatic Speech Recognition +- title: API Reference + sections: + - local: tasks/index + title: Index + - local: hub-api + title: Hub API + - title: Popular Tasks + sections: - local: tasks/chat-completion title: Chat Completion - local: tasks/feature-extraction title: Feature Extraction + - local: tasks/text-to-image + title: Text to Image + - title: Other Tasks + sections: + - local: tasks/audio-classification + title: Audio Classification + - local: tasks/automatic-speech-recognition + title: Automatic Speech Recognition - local: tasks/fill-mask title: Fill Mask - local: tasks/image-classification @@ -30,8 +36,6 @@ title: Image Segmentation - local: tasks/image-to-image title: Image to Image - - local: tasks/image-text-to-text - title: Image-Text to Text - local: tasks/object-detection title: Object Detection - local: tasks/question-answering @@ -44,13 +48,9 @@ title: Text Classification - local: tasks/text-generation title: Text Generation - - local: tasks/text-to-image - title: Text to Image - local: tasks/token-classification title: Token Classification - local: tasks/translation title: Translation - local: tasks/zero-shot-classification - title: Zero Shot Classification - title: Detailed Task Parameters - title: API Reference \ No newline at end of file + title: Zero Shot Classification \ No newline at end of file diff --git a/docs/api-inference/getting-started.md b/docs/api-inference/getting-started.md deleted file mode 100644 index fd13c24534..0000000000 --- a/docs/api-inference/getting-started.md +++ /dev/null @@ -1,95 +0,0 @@ -# Getting Started - -The Serverless Inference API allows you to easily do inference on a wide range of models and tasks. You can do requests with your favorite tools (Python, cURL, etc). We also provide a Python SDK (`huggingface_hub`) and JavaScript SDK (`huggingface.js`) to make it even easier. - -We'll do a minimal example using a [sentiment classification model](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest). Please visit task-specific parameters and further documentation in our [API Reference](./parameters). - -## Getting a Token - -Using the Serverless Inference API requires passing a user token in the request headers. 
You can get a token by signing up on the Hugging Face website and then going to the [tokens page](https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained). We recommend creating a `fine-grained` token with the scope to `Make calls to the serverless Inference API`. - -For more details about user tokens, check out [this guide](https://huggingface.co/docs/hub/en/security-tokens). - -## cURL - -```bash -curl 'https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest' \ --H "Authorization: Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \ --H 'Content-Type: application/json' \ --d '{"inputs": "Today is a great day"}' -``` - -## Python - -You can use the `requests` library to make a request to the Inference API. - -```python -import requests - -API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest" -headers = {"Authorization": "Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"} -payload = { - "inputs": "Today is a great day", -} - -response = requests.post(API_URL, headers=headers, json=payload) -response.json() -``` - -Hugging Face also provides a [`InferenceClient`](https://huggingface.co/docs/huggingface_hub/guides/inference) that handles inference for you. Make sure to install it with `pip install huggingface_hub` first. - -```python -from huggingface_hub import InferenceClient - -client = InferenceClient( - "cardiffnlp/twitter-roberta-base-sentiment-latest", - token="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", -) - -client.text_classification("Today is a great day") -``` - -## JavaScript - -```js -import fetch from "node-fetch"; - -async function query(data) { - const response = await fetch( - "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest", - { - method: "POST", - headers: { - Authorization: `Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx`, - "Content-Type": "application/json", - }, - body: JSON.stringify(data), - } - ); - const result = await response.json(); - return result; -} - -query({inputs: "Today is a great day"}).then((response) => { - console.log(JSON.stringify(response, null, 2)); -}); -``` - -Hugging Face also provides a [`HfInference`](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference) client that handles inference. Make sure to install it with `npm install @huggingface/inference` first. - -```js -import { HfInference } from "@huggingface/inference"; - -const inference = new HfInference("hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"); - -const result = await inference.textClassification({ - model: "cardiffnlp/twitter-roberta-base-sentiment-latest", - inputs: "Today is a great day", -}); - -console.log(result); -``` - -## Next Steps - -Now that you know the basics, you can explore [API Reference](https://huggingface.co/docs/api-inference/parameters) doc to learn more about the parameters and task-specific settings. diff --git a/docs/api-inference/hub-api.md b/docs/api-inference/hub-api.md new file mode 100644 index 0000000000..fb1cf161c1 --- /dev/null +++ b/docs/api-inference/hub-api.md @@ -0,0 +1,173 @@ +# Hub API + +The Hub provides a few APIs to interact with Inference Providers. 
Here is a list of them: + +## List models + +To list models powered by a provider, use the `inference_provider` query parameter: + +```sh +# List all models served by Fireworks AI +~ curl -s https://huggingface.co/api/models?inference_provider=fireworks-ai | jq ".[].id" +"deepseek-ai/DeepSeek-V3-0324" +"deepseek-ai/DeepSeek-R1" +"Qwen/QwQ-32B" +"deepseek-ai/DeepSeek-V3" +... +``` + +It can be combined with other filters to e.g. select only `text-to-image` models: + +```sh +# List text-to-image models served by Fal AI +~ curl -s https://huggingface.co/api/models?inference_provider=fal-ai&pipeline_tag=text-to-image | jq ".[].id" +"black-forest-labs/FLUX.1-dev" +"stabilityai/stable-diffusion-3.5-large" +"black-forest-labs/FLUX.1-schnell" +"stabilityai/stable-diffusion-3.5-large-turbo" +... +``` + +Pass a comma-separated list of providers to select multiple: + +```sh +# List image-text-to-text models served by Novita or Sambanova +~ curl -s https://huggingface.co/api/models?inference_provider=sambanova,novita&pipeline_tag=image-text-to-text | jq ".[].id" +"meta-llama/Llama-3.2-11B-Vision-Instruct" +"meta-llama/Llama-3.2-90B-Vision-Instruct" +"Qwen/Qwen2-VL-72B-Instruct" +``` + +Finally, you can select all models served by at least one inference provider: + +```sh +# List text-to-video models served by any provider +~ curl -s https://huggingface.co/api/models?inference_provider=all&pipeline_tag=text-to-video | jq ".[].id" +"Wan-AI/Wan2.1-T2V-14B" +"Lightricks/LTX-Video" +"tencent/HunyuanVideo" +"Wan-AI/Wan2.1-T2V-1.3B" +"THUDM/CogVideoX-5b" +"genmo/mochi-1-preview" +"BagOu22/Lora_HKLPAZ" +``` + +## Get model status + +To find an inference provider for a specific model, request the `inference` attribute in the model info endpoint: + + + + + +```sh +# Get google/gemma-3-27b-it inference status (warm) +~ curl -s https://huggingface.co/api/models/google/gemma-3-27b-it?expand[]=inference +{ +"_id": "67c35b9bb236f0d365bf29d3", +"id": "google/gemma-3-27b-it", +"inference": "warm" +} +``` + + + + +In the `huggingface_hub`, use `model_info` with the expand parameter: + +```py +>>> from huggingface_hub import model_info + +>>> info = model_info("google/gemma-3-27b-it", expand="inference") +>>> info.inference +'warm' +``` + + + + + +Inference status is either "warm" or undefined: + + + + + +```sh +# Get inference status (no inference) +~ curl -s https://huggingface.co/api/models/manycore-research/SpatialLM-Llama-1B?expand[]=inference +{ +"_id": "67d3b141d8b6e20c6d009c8b", +"id": "manycore-research/SpatialLM-Llama-1B" +} +``` + + + + + +In the `huggingface_hub`, use `model_info` with the expand parameter: + +```py +>>> from huggingface_hub import model_info + +>>> info = model_info("manycore-research/SpatialLM-Llama-1B", expand="inference") +>>> info.inference +None +``` + + + + + +## Get model providers + +If you are interested by a specific model and want to check the list of providers serving it, you can request the `inferenceProviderMapping` attribute in the model info endpoint: + + + + + +```sh +# List google/gemma-3-27b-it providers +~ curl -s https://huggingface.co/api/models/google/gemma-3-27b-it?expand[]=inferenceProviderMapping +{ + "_id": "67c35b9bb236f0d365bf29d3", + "id": "google/gemma-3-27b-it", + "inferenceProviderMapping": { + "hf-inference": { + "status": "live", + "providerId": "google/gemma-3-27b-it", + "task": "conversational" + }, + "nebius": { + "status": "live", + "providerId": "google/gemma-3-27b-it-fast", + "task": "conversational" + } + } +} +``` + + + + +In the `huggingface_hub`, 
use `model_info` with the expand parameter: + +```py +>>> from huggingface_hub import model_info + +>>> info = model_info("google/gemma-3-27b-it", expand="inferenceProviderMapping") +>>> info.inference_provider_mapping +{ + 'hf-inference': InferenceProviderMapping(status='live', provider_id='google/gemma-3-27b-it', task='conversational'), + 'nebius': InferenceProviderMapping(status='live', provider_id='google/gemma-3-27b-it-fast', task='conversational'), +} +``` + + + + + + +Each provider serving the model shows a status (`staging` or `live`), the related task (here, `conversational`) and the providerId. In practice, this information is relevant for the JS and Python clients. diff --git a/docs/api-inference/hub-integration.md b/docs/api-inference/hub-integration.md new file mode 100644 index 0000000000..b325ebf0cb --- /dev/null +++ b/docs/api-inference/hub-integration.md @@ -0,0 +1,67 @@ +# Hub Integration + +Inference Providers is tightly integrated with the Hugging Face Hub. No matter which provider you use, the usage and billing will be centralized in your Hugging Face account. + +## Model search + +When listing models on the Hub, you can filter to select models deployed on the inference provider of your choice. For example, to list all models deployed on Fireworks AI infra: https://huggingface.co/models?inference_provider=fireworks-ai. + +
+ +It is also possible to select all or multiple providers and filter their available models: https://huggingface.co/models?inference_provider=all. + +
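+The same filters are exposed by the Hub API (see the [Hub API guide](./hub-api)). As a minimal sketch, here is the equivalent query made from Python with `requests`:
+
+```python
+import requests
+
+# Same filters as the model search UI: models served by a given provider,
+# optionally narrowed down to a single task.
+params = {"inference_provider": "fal-ai", "pipeline_tag": "text-to-image"}
+response = requests.get("https://huggingface.co/api/models", params=params)
+response.raise_for_status()
+
+for model in response.json():
+    print(model["id"])
+```
+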
+ +## Features using Inference Providers + +Several Hugging Face features utilize Inference Providers and count towards your monthly credits. The included monthly credits for PRO and Enterprise should cover moderate usage of these features for most users. + +### Inference Widgets + +Interactive widgets available on model pages (e.g. [deepseek-ai/DeepSeek-V3-0324](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324)). This is the entry point to quickly test a model on the Hub. + +
+ +### Inference Playground + +A comprehensive chat interface supporting various models and providers available at https://huggingface.co/playground. + +
+ +### Data Studio AI + +Converts text to SQL queries on dataset pages (e.g. [open-r1/codeforces-cots](https://huggingface.co/datasets/open-r1/codeforces-cots/viewer)). + +
+
+## User Settings
+
+In your user account settings, you are able to:
+- set your own API keys for the providers you’ve signed up with. If you don't, your requests will be billed on your HF account. More details in the [billing section](./pricing#routed-requests-vs-direct-calls) and in the sketch after this list.
+
+ +- order providers by preference. This applies to the widget and code snippets in the model pages. + +
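+To make the billing difference concrete, here is a minimal sketch with the Python `InferenceClient` (the provider name and key placeholder are only examples):
+
+```python
+from huggingface_hub import InferenceClient
+
+# Routed request (default): pass your Hugging Face token. Billing goes through
+# Hugging Face, unless you saved a custom key for this provider in your settings,
+# in which case Hugging Face swaps the authentication and the provider bills you.
+routed_client = InferenceClient(provider="novita", api_key="hf_xxxxxxxxxxxxxxxxxxxxxxxx")
+
+# Direct call: pass the provider's own key and the request goes straight to the
+# provider's platform, bypassing Hugging Face routing and billing.
+direct_client = InferenceClient(provider="novita", api_key="<your-novita-api-key>")
+```
+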
diff --git a/docs/api-inference/index.md b/docs/api-inference/index.md index 69926ce634..a27bb14452 100644 --- a/docs/api-inference/index.md +++ b/docs/api-inference/index.md @@ -1,61 +1,172 @@ -# Serverless Inference API +# Inference Providers -**Instant Access to thousands of ML Models for Fast Prototyping** +Hugging Face Inference Providers simplify and unify how developers access and run machine learning models by offering a unified, flexible interface to multiple serverless inference providers. This new approach extends our previous Serverless Inference API, providing more models, increased performances and better reliability thanks to our inference partners. -Explore the most popular models for text, image, speech, and more — all with a simple API request. Build, test, and experiment without worrying about infrastructure or setup. +To learn more about the launch of Inference Providers, check out our [announcement blog post](https://huggingface.co/blog/inference-providers). ---- +## Why use Inference Providers? -## Why use the Inference API? - -The Serverless Inference API offers a fast and simple way to explore thousands of models for a variety of tasks. Whether you're prototyping a new application or experimenting with ML capabilities, this API gives you instant access to high-performing models across multiple domains: +Inference Providers offers a fast and simple way to explore thousands of models for a variety of tasks. Whether you're experimenting with ML capabilities or building a new application, this API gives you instant access to high-performing models across multiple domains: * **Text Generation:** Including large language models and tool-calling prompts, generate and experiment with high-quality responses. -* **Image Generation:** Easily create customized images, including LoRAs for your own styles. +* **Image and Video Generation:** Easily create customized images, including LoRAs for your own styles. * **Document Embeddings:** Build search and retrieval systems with SOTA embeddings. * **Classical AI Tasks:** Ready-to-use models for text classification, image classification, speech recognition, and more. -⚡ **Fast and Free to Get Started**: The Inference API is free to try out and comes with additional included credits for PRO users. For production needs, explore [Inference Endpoints](https://ui.endpoints.huggingface.co/) for dedicated resources, autoscaling, advanced security features, and more. - ---- - -## Key Benefits - -- 🚀 **Instant Prototyping:** Access powerful models without setup. -- 🎯 **Diverse Use Cases:** One API for text, image, and beyond. -- 🔧 **Developer-Friendly:** Simple requests, fast responses. - ---- - -## Main Features +⚡ **Fast and Free to Get Started**: Inference Providers comes with a free-tier and additional included credits for [PRO users](https://hf.co/subscribe/pro), as well as [Enterprise Hub organizations](https://huggingface.co/enterprise). -* Leverage over 800,000+ models from different open-source libraries (transformers, sentence transformers, adapter transformers, diffusers, timm, etc.). -* Use models for a variety of tasks, including text generation, image generation, document embeddings, NER, summarization, image classification, and more. -* Accelerate your prototyping by using GPU-powered models. -* Run very large models that are challenging to deploy in production. -* Production-grade platform without the hassle: built-in automatic scaling, load balancing and caching. 
+## Key Features ---- +- **🎯 All-in-One API**: A single API for text generation, image generation, document embeddings, NER, summarization, image classification, and more. +- **🔀 Multi-Provider Support**: Easily run models from top-tier providers like fal, Replicate, Sambanova, Together AI, and others. +- **🚀 Scalable & Reliable**: Built for high availability and low-latency performance in production environments. +- **🔧 Developer-Friendly**: Simple requests, fast responses, and a consistent developer experience across Python and JavaScript clients. +- **💰 Cost-Effective**: No extra markup on provider rates. -## Contents - -The documentation is organized into two sections: - -* **Getting Started** Learn the basics of how to use the Inference API. -* **API Reference** Dive into task-specific settings and parameters. - ---- ## Inference Playground -If you want to get started quickly with [Chat Completion models](https://huggingface.co/models?inference=warm&other=conversational&sort=trending) use the Inference Playground to quickly test and compare models against your prompts. +To get started quickly with [Chat Completion models](http://huggingface.co/models?inference_provider=all&sort=trending&other=conversational), use the [Inference Playground](https://huggingface.co/playground) to easily test and compare models with your prompts. - - ---- -## Serious about AI in your organisation? Build faster with the Hugging Face Enterprise Hub. + - - Hugging Face Enterprise Hub -
+## Get Started + +You can use Inference Providers with your preferred tools, such as Python, JavaScript, or cURL. To simplify integration, we offer both a Python SDK (`huggingface_hub`) and a JavaScript SDK (`huggingface.js`). + +In this section, we will demonstrate a simple example using [deepseek-ai/DeepSeek-V3-0324](https://huggingface.co/deepseek-ai/DeepSeek-V3-0324), a conversational Large Language Model. For the example, we will use [Novita AI](https://novita.ai/) as Inference Provider. + +### Authentication + +Inference Providers requires passing a user token in the request headers. You can generate a token by signing up on the Hugging Face website and going to the [settings page](https://huggingface.co/settings/tokens/new?ownUserPermissions=inference.serverless.write&tokenType=fineGrained). We recommend creating a `fine-grained` token with the scope to `Make calls to Inference Providers`. + +For more details about user tokens, check out [this guide](https://huggingface.co/docs/hub/en/security-tokens). + +### cURL + +Let's start with a cURL command highlighting the raw HTTP request. You can adapt this request to be run with the tool of your choice. + +```bash +curl https://router.huggingface.co/novita/v3/openai/chat/completions \ + -H "Authorization: Bearer $HF_TOKEN" \ + -H 'Content-Type: application/json' \ + -d '{ + "messages": [ + { + "role": "user", + "content": "How many G in huggingface?" + } + ], + "model": "deepseek/deepseek-v3-0324", + "stream": false + }' +``` + +### Python + +In Python, you can use the `requests` library to make raw requests to the API: + +```python +import requests + +API_URL = "https://router.huggingface.co/novita/v3/openai/chat/completions" +headers = {"Authorization": "Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"} +payload = { + "messages": [ + { + "role": "user", + "content": "How many 'G's in 'huggingface'?" + } + ], + "model": "deepseek/deepseek-v3-0324", +} + +response = requests.post(API_URL, headers=headers, json=payload) +print(response.json()["choices"][0]["message"]) +``` + +For convenience, the Python library `huggingface_hub` provides an [`InferenceClient`](https://huggingface.co/docs/huggingface_hub/guides/inference) that handles inference for you. Make sure to install it with `pip install huggingface_hub`. + +```python +from huggingface_hub import InferenceClient + +client = InferenceClient( + provider="novita", + api_key="hf_xxxxxxxxxxxxxxxxxxxxxxxx", +) + +completion = client.chat.completions.create( + model="deepseek-ai/DeepSeek-V3-0324", + messages=[ + { + "role": "user", + "content": "How many 'G's in 'huggingface'?" + } + ], +) + +print(completion.choices[0].message) +``` + +### JavaScript + +In JS, you can use the `fetch` library to make raw requests to the API: + + +```js +import fetch from "node-fetch"; + +const response = await fetch( + "https://router.huggingface.co/novita/v3/openai/chat/completions", + { + method: "POST", + headers: { + Authorization: `Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + provider: "novita", + model: "deepseek-ai/DeepSeek-V3-0324", + messages: [ + { + role: "user", + content: "How many 'G's in 'huggingface'?", + }, + ], + }), + } +); +console.log(await response.json()); +``` + +For convenience, the JS library `@huggingface/inference` provides an [`InferenceClient`](https://huggingface.co/docs/huggingface.js/inference/classes/InferenceClient) that handles inference for you. 
You can install it with `npm install @huggingface/inference`. + + +```js +import { InferenceClient } from "@huggingface/inference"; + +const client = new InferenceClient("hf_xxxxxxxxxxxxxxxxxxxxxxxx"); + +const chatCompletion = await client.chatCompletion({ + provider: "novita", + model: "deepseek-ai/DeepSeek-V3-0324", + messages: [ + { + role: "user", + content: "How many 'G's in 'huggingface'?", + }, + ], +}); + +console.log(chatCompletion.choices[0].message); +``` + +## Next Steps + +In this introduction, we've covered the basics of Inference Providers. To learn more about this service, check out our guides and API Reference: +- [Pricing and Billing](./pricing): everything you need to know about billing +- [Hub integration](./hub-integration): how is Inference Providers integrated with the Hub? +- [External Providers](./providers): everything about providers and how to become an official partner +- [Hub API](./hub-api): high-level API for Inference Providers +- [API Reference](./tasks/index): learn more about the parameters and task-specific settings. diff --git a/docs/api-inference/parameters.md b/docs/api-inference/parameters.md deleted file mode 100644 index b225cafd5f..0000000000 --- a/docs/api-inference/parameters.md +++ /dev/null @@ -1,145 +0,0 @@ -# Parameters - - -## Additional Options - -### Caching - -There is a cache layer on the inference API to speed up requests when the inputs are exactly the same. Many models, such as classifiers and embedding models, can use those results as is if they are deterministic, meaning the results will be the same. However, if you use a nondeterministic model, you can disable the cache mechanism from being used, resulting in a real new query. - -To do this, you can add `x-use-cache:false` to the request headers. For example - - - - -```diff -curl https://api-inference.huggingface.co/models/MODEL_ID \ - -X POST \ - -d '{"inputs": "Can you please let us know more details about your "}' \ - -H "Authorization: Bearer hf_***" \ - -H "Content-Type: application/json" \ -+ -H "x-use-cache: false" -``` - - - -```diff -import requests - -API_URL = "https://api-inference.huggingface.co/models/MODEL_ID" -headers = { - "Authorization": "Bearer hf_***", - "Content-Type": "application/json", -+ "x-use-cache": "false" -} -data = { - "inputs": "Can you please let us know more details about your " -} -response = requests.post(API_URL, headers=headers, json=data) -print(response.json()) -``` - - - - -```diff -import fetch from "node-fetch"; - -async function query(data) { - const response = await fetch( - "https://api-inference.huggingface.co/models/MODEL_ID", - { - method: "POST", - headers: { - Authorization: `Bearer hf_***`, - "Content-Type": "application/json", -+ "x-use-cache": "false" - }, - body: JSON.stringify(data), - } - ); - const result = await response.json(); - return result; -} - -query({ - inputs: "Can you please let us know more details about your " -}).then((response) => { - console.log(JSON.stringify(response, null, 2)); -}); - -``` - - - - - -### Wait for the model - -When a model is warm, it is ready to be used and you will get a response relatively quickly. However, some models are cold and need to be loaded before they can be used. In that case, you will get a 503 error. Rather than doing many requests until it's loaded, you can wait for the model to be loaded by adding `x-wait-for-model:true` to the request headers. We suggest to only use this flag to wait for the model to be loaded when you are sure that the model is cold. 
That means, first try the request without this flag and only if you get a 503 error, try again with this flag. - - - - - -```diff -curl https://api-inference.huggingface.co/models/MODEL_ID \ - -X POST \ - -d '{"inputs": "Can you please let us know more details about your "}' \ - -H "Authorization: Bearer hf_***" \ - -H "Content-Type: application/json" \ -+ -H "x-wait-for-model: true" -``` - - - -```diff -import requests - -API_URL = "https://api-inference.huggingface.co/models/MODEL_ID" -headers = { - "Authorization": "Bearer hf_***", - "Content-Type": "application/json", -+ "x-wait-for-model": "true" -} -data = { - "inputs": "Can you please let us know more details about your " -} -response = requests.post(API_URL, headers=headers, json=data) -print(response.json()) -``` - - - - -```diff -import fetch from "node-fetch"; - -async function query(data) { - const response = await fetch( - "https://api-inference.huggingface.co/models/MODEL_ID", - { - method: "POST", - headers: { - Authorization: `Bearer hf_***`, - "Content-Type": "application/json", -+ "x-wait-for-model": "true" - }, - body: JSON.stringify(data), - } - ); - const result = await response.json(); - return result; -} - -query({ - inputs: "Can you please let us know more details about your " -}).then((response) => { - console.log(JSON.stringify(response, null, 2)); -}); - -``` - - - - \ No newline at end of file diff --git a/docs/api-inference/pricing.md b/docs/api-inference/pricing.md index 79ecb1014e..4c095a088a 100644 --- a/docs/api-inference/pricing.md +++ b/docs/api-inference/pricing.md @@ -1,30 +1,80 @@ -# Pricing and Rate limits +# Pricing and Billing -As a HF user, you get monthly credits to run the HF Inference API. The amount of credits you get depends on your type of account (Free or PRO or Enterprise Hub), see table below. -You get charged for every inference request, based on the compute time x price of the underlying hardware. +Inference Providers is a production-ready service involving external partners and is therefore a paid product. However, as a Hugging Face user, you get monthly credits to run experiments. The amount of credits you get depends on your type of account: + +| User Tier | Included monthly credits | +| ------------------------ | ---------------------------------- | +| Free Users | subject to change, less than $0.10 | +| PRO and Enterprise Users | $2.00 | + +## Pay-as-you-Go + +**PRO and Enterprise Hub users** can continue using the API once their monthly included credits are exhausted. This billing model, known as "Pay-as-you-Go" (PAYG), is charged on top of the monthly subscription. PAYG is only available for providers that are integrated with our billing system. We're actively working to integrate all providers, but in the meantime, any providers that are not yet integrated will be blocked once the free-tier limit is reached. + +If you have remaining credits, we estimate costs for providers that aren’t fully integrated with our billing system. These estimates are usually higher than the actual cost to prevent abuse, which is why PAYG is currently disabled for those providers. + +You can track your spending on your [billing page](https://huggingface.co/settings/billing). + + + +Hugging Face charges you the same rates as the provider, with no additional fees. + + + +## Routed requests vs direct calls + +The documentation above assumes you are making routed requests to external providers. 
In practice, there are 3 different ways to run inference, each with unique billing implications: + +- **Routed Request**: This is the default method for using Inference Providers. Simply use the JavaScript or Python `InferenceClient`, or make raw HTTP requests with your Hugging Face User Access Token. Your request is automatically routed through Hugging Face to the provider's platform. No separate provider account is required, and billing is managed directly by Hugging Face. This approach lets you seamlessly switch between providers without additional setup. + +- **Routed Request with Custom Key**: In your [settings page](https://huggingface.co/settings/inference-providers) on the Hub, you can configure a custom key for each provider. To use this option, you'll need to create an account on the provider's platform, and billing will be handled directly by that provider. Hugging Face won't charge you for the call. This method gives you more control over billing when experimenting with models on the Hub. When making a routed request with a custom key, your code remains unchanged—you'll still pass your Hugging Face User Access Token. Hugging Face will automatically swap the authentication when routing the request. + +- **Direct Calls**: If you provide a custom key when using the JavaScript or Python `InferenceClient`, the call will be made directly to the provider's platform. Billing is managed by the provider, and Hugging Face is not notified of the request. This option is ideal if you want to use the unified `InferenceClient` interface without routing through Hugging Face. + +Here is a table that sums up what we've seen so far: + +| | HF routing | Billed by | Free-tier included | Pay-as-you-go | Integration | +| ---------------------------------- | ---------- | ------------ | ------------------ | ----------------------------------------------- | ----------------------------------------- | +| **Routed request** | Yes | Hugging Face | Yes | Only for PRO users and for integrated providers | SDKs, Playground, widgets, Data AI Studio | +| **Routed request with custom key** | Yes | Provider | No | Yes | SDKs, Playground, widgets, Data AI Studio | +| **Direct call** | No | Provider | No | Yes | SDKs only | + +## HF-Inference cost + +As you may have noticed, you can select to work with `"hf-inference"` provider. This service used to be "Inference API (serverless)" prior to Inference Providers. From a user point of view, working with HF Inference is the same as with any other provider. Past the free-tier credits, you get charged for every inference request based on the compute time x price of the underlying hardware. For instance, a request to [black-forest-labs/FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) that takes 10 seconds to complete on a GPU machine that costs $0.00012 per second to run, will be billed $0.0012. -When your monthly included credits are depleted: -- if you're a Free user, you won't be able to query the Inference API anymore, -- if you're a PRO or Enterprise Hub user, you will get charged for the requests on top of your subscription. You can monitor your spending on your billing page. +The `"hf-inference"` provider is currently the default provider when working with the JavaScript and Python SDKs. Note that this default might change in the future. + +## Organization billing -Note that HF Inference API is not meant to be used for heavy production applications. 
If you need to handle large numbers of requests, consider [Inference Endpoints](https://huggingface.co/docs/inference-endpoints) to have dedicated resources or [Inference Providers](https://huggingface.co/blog/inference-providers) for serverless usage. +For Enterprise Hub organizations, it is possible to centralize billing for all your users. Each user still uses their own User Access Token but the requests are billed to your organization. This can be done by passing `"X-HF-Bill-To: my-org-name"` as header in your HTTP requests. -You need to be authenticated (passing a token or through your browser) to use the Inference API. +If you are using the JavaScript `InferenceClient`, you can set the `billTo` attribute at a client level: +```js +import { InferenceClient } from "@huggingface/inference"; -| User Tier | Included monthly credits | -|---------------------------|------------------------------------| -| Free Users | subject to change, less than $0.10 | -| PRO and Enterprise Users | $2.00 | +const client = new InferenceClient("hf_token", { billTo: "my-org-name" }); -### Features using Inference Providers +const image = await client.textToImage({ + model: "black-forest-labs/FLUX.1-schnell", + inputs: "A majestic lion in a fantasy forest", + provider: "fal-ai", +}); +/// Use the generated image (it's a Blob) +``` -Several Hugging Face features utilize the Inference Providers API and count towards your monthly credits: +And similarly in Python: -- Inference Widgets: Interactive widgets available on model pages. -- [Inference Playground](https://huggingface.co/playground): A comprehensive chat interface supporting various models and providers. -- Data Studio AI: Converts text to SQL queries for datasets. +```py +from huggingface_hub import InferenceClient +client = InferenceClient(provider="fal-ai", bill_to="my-org-name") +image = client.text_to_image( + "A majestic lion in a fantasy forest", + model="black-forest-labs/FLUX.1-schnell", +) +image.save("lion.png") +``` -The included monthly credits for PRO and Enterprise should cover moderate usage of these features for most users. diff --git a/docs/api-inference/security.md b/docs/api-inference/security.md index 2e2c3fcd89..fa1997c3d2 100644 --- a/docs/api-inference/security.md +++ b/docs/api-inference/security.md @@ -1,15 +1,15 @@ # Security & Compliance -The Inference API is not designed for heavy production requirements. For production needs, explore [Inference Endpoints](https://ui.endpoints.huggingface.co/) for dedicated resources, autoscaling, advanced security features, and more. - ## Data Security/Privacy -Hugging Face does not store any user data for training purposes. Tokens sent to the API might be stored in a short-term (few minutes) cache mechanism to speed-up repeated requests. Logs are stored for debugging for up to 30 days. Any additional data in terms of user data or tokens are not stored. +Hugging Face does not store any user data for training purposes. We do not store the request body or response when routing requests through Hugging Face. Logs are kept for debugging purposes for up to 30 days, but no user data or tokens are stored. + +For more information on how your data is handled, please refer to the Data Security Policies of each provider. -Serverless Inference API use TLS/SSL to encrypt the data in transit. +Inference Provider routing uses TLS/SSL to encrypt data in transit. ## Hub Security -The Hugging Face Hub, which Serverless Inference API is part, is SOC2 Type 2 certified. 
For more on Hub security: https://huggingface.co/docs/hub/security +The Hugging Face Hub, which Inference Providers is a feature of, is SOC2 Type 2 certified. For more on Hub security: https://huggingface.co/docs/hub/security. External providers are responsible for their own security measures, so please refer to their respective security policies for more details. diff --git a/docs/api-inference/supported-models.md b/docs/api-inference/supported-models.md deleted file mode 100644 index e58a117786..0000000000 --- a/docs/api-inference/supported-models.md +++ /dev/null @@ -1,30 +0,0 @@ -# Supported Models - -Given the fast-paced nature of the open ML ecosystem, the Inference API exposes models that have large community interest and are in active use (based on recent likes, downloads, and usage). Because of this, deployed models can be swapped without prior notice. The Hugging Face stack aims to keep all the latest popular models warm and ready to use. - -You can find: - -* **[Warm models](https://huggingface.co/models?inference=warm&sort=trending):** models ready to be used. -* **[Cold models](https://huggingface.co/models?inference=cold&sort=trending):** models that are not loaded but can be used. -* **[Frozen models](https://huggingface.co/models?inference=frozen&sort=trending):** models that currently can't be run with the API. - -## What do I get with a PRO subscription? - -In addition to thousands of public models available in the Hub, PRO and Enterprise users get higher [included credits](./pricing) and access to the following models: - - - -| Model | Size | Supported Context Length | Use | -|--------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------|--------------------------------------------------------------| -| Meta Llama 3.1 Instruct | [8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct), [70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 70B: 32k tokens / 8B: 8k tokens | High quality multilingual chat model with large context length | -| Meta Llama 3 Instruct | [8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct), [70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 8k tokens | One of the best chat models | -| Meta Llama Guard 3 | [8B](https://huggingface.co/meta-llama/Llama-Guard-3-8B) | 4k tokens | | -| Llama 2 Chat | [7B](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [13B](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf), [70B](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4k tokens | One of the best conversational models | -| DeepSeek Coder v2 | [236B](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct) | 16k tokens | A model with coding capabilities. | -| Bark | [0.9B](https://huggingface.co/suno/bark) | - | Text to audio generation | - -This list is not exhaustive and might be updated in the future. - -## Running Private Models - -The Serverless API is designed to run popular public models. If you have a private model, you can use [Inference Endpoints](https://huggingface.co/docs/inference-endpoints) to deploy it. 
diff --git a/docs/api-inference/tasks/image-text-to-text.md b/docs/api-inference/tasks/image-text-to-text.md deleted file mode 100644 index 14903fb01a..0000000000 --- a/docs/api-inference/tasks/image-text-to-text.md +++ /dev/null @@ -1,128 +0,0 @@ - - -## Image-Text to Text - -Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input. - - - -For more details about the `image-text-to-text` task, check out its [dedicated page](https://huggingface.co/tasks/image-text-to-text)! You will find examples and related materials. - - - -### Recommended models - -- [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): Strong image-text-to-text model. - -Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-text-to-text&sort=trending). - -### Using the API - - - - - -```bash -curl https://router.huggingface.co/hf-inference/models/Qwen/Qwen2.5-VL-7B-Instruct \ - -X POST \ - -d '{"inputs": "Can you please let us know more details about your "}' \ - -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer hf_***' -``` - - - -Using `huggingface_hub`: -```py -from huggingface_hub import InferenceClient - -client = InferenceClient( - provider="hf-inference", - api_key="hf_***" -) - -messages = "\"Can you please let us know more details about your \"" - -stream = client.chat.completions.create( - model="Qwen/Qwen2.5-VL-7B-Instruct", - messages=messages, - max_tokens=500, - stream=True -) - -for chunk in stream: - print(chunk.choices[0].delta.content, end="") -``` - -Using `openai`: -```py -from openai import OpenAI - -client = OpenAI( - base_url="https://router.huggingface.co/hf-inference/v1", - api_key="hf_***" -) - -messages = "\"Can you please let us know more details about your \"" - -stream = client.chat.completions.create( - model="Qwen/Qwen2.5-VL-7B-Instruct", - messages=messages, - max_tokens=500, - stream=True -) - -for chunk in stream: - print(chunk.choices[0].delta.content, end="") -``` - -To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.image_text_to_text). - - - -```js -async function query(data) { - const response = await fetch( - "https://router.huggingface.co/hf-inference/models/Qwen/Qwen2.5-VL-7B-Instruct", - { - headers: { - Authorization: "Bearer hf_***", - "Content-Type": "application/json", - }, - method: "POST", - body: JSON.stringify(data), - } - ); - const result = await response.json(); - return result; -} - -query({"inputs": "Can you please let us know more details about your "}).then((response) => { - console.log(JSON.stringify(response)); -}); -``` - -To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#imagetexttotext). - - - - - - -### API specification - -For the API specification of conversational image-text-to-text models, please refer to the [Chat Completion API documentation](https://huggingface.co/docs/api-inference/tasks/chat-completion#api-specification). 
- - diff --git a/docs/api-inference/tasks/index.md b/docs/api-inference/tasks/index.md new file mode 100644 index 0000000000..60b15a8a01 --- /dev/null +++ b/docs/api-inference/tasks/index.md @@ -0,0 +1,179 @@ +# API Reference + +## Popular tasks + +
+- [Chat Completion](./chat-completion): Generate a response given a list of messages in a conversational context.
+- [Feature Extraction](./feature-extraction): Convert a text into a vector, often called an "embedding".
+- [Text to Image](./text-to-image): Generate an image based on a given text prompt.
+
+## Other tasks
+
+- [Audio Classification](./audio-classification): Assign a label or class to a given audio.
+- [Automatic Speech Recognition](./automatic-speech-recognition): Transcribe a given audio to text; also known as Speech to Text (STT).
+- [Fill Mask](./fill-mask): Predict the right word (token, to be precise) in the middle of a sequence.
+- [Image Classification](./image-classification): Assign a label or class to an entire image; images are expected to have only one class each.
+- [Image Segmentation](./image-segmentation): Divide an image into segments where each pixel is mapped to an object.
+- [Image to Image](./image-to-image): Transform a source image to match the characteristics of a target image or a target image domain.
+- [Object Detection](./object-detection): Identify objects of certain defined classes in an image.
+- [Question Answering](./question-answering): Retrieve the answer to a question from a given text, which is useful for searching for an answer in a document.
+- [Summarization](./summarization): Produce a shorter version of a document while preserving its important information.
+- [Table Question Answering](./table-question-answering): Answer a question about the information in a given table.
+- [Text Classification](./text-classification): Assign a label or class to a given text.
+- [Text Generation](./text-generation): Generate text based on a prompt.
+- [Token Classification](./token-classification): Assign a label to individual tokens in a text.
+- [Translation](./translation): Convert text from one language to another.
+- [Zero Shot Classification](./zero-shot-classification): Classify text without task-specific training.
+
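+Each task page documents its parameters, API specification, and ready-to-use code snippets. As a quick, minimal sketch of calling one of the tasks above with the Python `InferenceClient` (the model and provider are only examples):
+
+```python
+from huggingface_hub import InferenceClient
+
+# Example only: any provider/model pair serving the text-to-image task works the same way.
+client = InferenceClient(provider="fal-ai", api_key="hf_xxxxxxxxxxxxxxxxxxxxxxxx")
+
+image = client.text_to_image(
+    "A majestic lion in a fantasy forest",
+    model="black-forest-labs/FLUX.1-schnell",
+)
+image.save("lion.png")
+```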
\ No newline at end of file