20 changes: 13 additions & 7 deletions deployment/gpu-inference/transformers.mdx
@@ -1,6 +1,6 @@
---
title: "Transformers"
description: "Transformers is a library for inference and training of pretrained models."

---

<Tip>
@@ -53,12 +53,14 @@

 # Generate answer
 prompt = "What is C. elegans?"
-input_ids = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     [{"role": "user", "content": prompt}],
     add_generation_prompt=True,
     return_tensors="pt",
     tokenize=True,
-).to(model.device)
+    return_dict=True,
+)
+input_ids = inputs["input_ids"].to(model.device)

 output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)

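The edit above switches `apply_chat_template` from returning a bare `input_ids` tensor to returning a batch mapping: with `return_dict=True` the call yields both `input_ids` and `attention_mask`, so the ids are indexed out of the mapping before moving them to the device. A minimal sketch of the shape difference, with placeholder token-id lists standing in for real tensors:

```python
# Placeholder ids stand in for real tensors (illustration only).
# Without return_dict, apply_chat_template returns the ids directly:
without_dict = [[101, 2054, 2003, 102]]

# With return_dict=True, it returns a mapping that also carries the
# attention mask, which generate() can use to skip padding positions:
with_dict = {
    "input_ids": [[101, 2054, 2003, 102]],
    "attention_mask": [[1, 1, 1, 1]],
}

# The diff's new pattern: index the ids out of the mapping first.
input_ids = with_dict["input_ids"]
print(input_ids == without_dict)  # → True
```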
@@ -73,7 +75,7 @@

* **`model_id`**: Can be a Hugging Face model ID (e.g., `"LiquidAI/LFM2.5-1.2B-Instruct"`) or a local path
* **`device_map="auto"`**: Automatically distributes across available GPUs/CPU (requires `accelerate`). Use `device="cuda"` for single GPU or `device="cpu"` for CPU only
* **`dtype="bfloat16"`**: Recommended for modern GPUs. Use `"auto"` for automatic selection, or `"float32"` (slower, more memory)


<Accordion title="Click to see a pipeline() example">
The [`pipeline()`](https://huggingface.co/docs/transformers/en/main_classes/pipelines) interface provides a simpler API for text generation with automatic chat template handling. It wraps model loading and tokenization, making it ideal for quick prototyping.
@@ -113,7 +115,7 @@

* **`do_sample`** (`bool`): Enable sampling (`True`) or greedy decoding (`False`, default)
* **`temperature`** (`float`, default 1.0): Controls randomness (0.0 = deterministic, higher = more random). Typical range: 0.1-2.0
* **`top_p`** (`float`, default 1.0): Nucleus sampling - limits to tokens with cumulative probability ≤ top\_p. Typical range: 0.1-1.0

* **`top_k`** (`int`, default 50): Limits to top-k most probable tokens. Typical range: 1-100
* **`min_p`** (`float`): Minimum token probability threshold. Typical range: 0.01-0.2
* **`max_new_tokens`** (`int`): Maximum number of tokens to generate (preferred over `max_length`)
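How these knobs interact can be seen in a toy re-implementation of temperature plus top-k sampling over raw logits — a sketch for intuition only, not the actual transformers implementation:

```python
import math
import random

def sample_next_token(logits, temperature=1.0, top_k=50, seed=None):
    """Toy temperature + top-k sampling over a list of raw logits."""
    rng = random.Random(seed)
    # top_k: keep only the k highest-scoring token ids.
    ranked = sorted(range(len(logits)), key=lambda i: logits[i], reverse=True)[:top_k]
    # temperature: rescale logits before softmax. Low values sharpen the
    # distribution toward the argmax; high values flatten it.
    scaled = [logits[i] / temperature for i in ranked]
    m = max(scaled)
    weights = [math.exp(s - m) for s in scaled]
    total = sum(weights)
    probs = [w / total for w in weights]
    # Draw one token id from the truncated, renormalized distribution.
    r = rng.random()
    acc = 0.0
    for token_id, p in zip(ranked, probs):
        acc += p
        if r < acc:
            return token_id
    return ranked[-1]

logits = [2.0, 0.5, -1.0, 3.0]
# temperature=0.1 makes the top logit (index 3) overwhelmingly likely.
print(sample_next_token(logits, temperature=0.1, top_k=2, seed=0))  # → 3
```

With `do_sample=False`, none of this applies: the highest-probability token is taken at every step.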
@@ -149,12 +151,14 @@

 # Use the model and tokenizer setup from Basic Usage above
 prompt = "Tell me a story about space exploration."
-input_ids = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     [{"role": "user", "content": prompt}],
     add_generation_prompt=True,
     return_tensors="pt",
     tokenize=True,
-).to(model.device)
+    return_dict=True,
+)
+input_ids = inputs["input_ids"].to(model.device)

 streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, streamer=streamer, max_new_tokens=512)
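The `TextStreamer` wired in above prints text as it is generated instead of waiting for the full completion: `generate()` hands it each decoded chunk as it arrives and signals when generation ends. A simplified toy stand-in showing that contract (illustration only, not the transformers class, which receives token ids rather than strings):

```python
# Toy stand-in for a streamer: emit each chunk immediately on arrival.
class ToyStreamer:
    def __init__(self):
        self.pieces = []

    def put(self, text):
        # Called once per decoded chunk during generation.
        self.pieces.append(text)
        print(text, end="", flush=True)

    def end(self):
        # Called when generation finishes.
        print()

streamer = ToyStreamer()
for chunk in ["Once", " upon", " a", " time"]:  # stand-in for generate()
    streamer.put(chunk)
streamer.end()
```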
@@ -180,16 +184,18 @@
 ]

 # Apply chat templates and tokenize
-batch = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     prompts,
     add_generation_prompt=True,
     return_tensors="pt",
     tokenize=True,
     padding=True,
-).to(model.device)
+    return_dict=True,
+)
+inputs = {k: v.to(model.device) for k, v in inputs.items()}

 # Generate for all prompts in batch
-outputs = model.generate(batch, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)
+outputs = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)

 # Decode outputs
 for output in outputs:
6 changes: 4 additions & 2 deletions scripts/generate_snippets.py
@@ -59,12 +59,14 @@
")\n"
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n"
"\n"
"input_ids = tokenizer.apply_chat_template(\n"
"inputs = tokenizer.apply_chat_template(\n"
' [{"role": "user", "content": "What is machine learning?"}],\n'
" add_generation_prompt=True,\n"
' return_tensors="pt",\n'
" tokenize=True,\n"
").to(model.device)\n"
" return_dict=True,\n"
")\n"
'input_ids = inputs["input_ids"].to(model.device)\n'
"\n"
"output = model.generate(input_ids, ${samplingParams}max_new_tokens=512)\n"
"response = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)\n"
6 changes: 4 additions & 2 deletions snippets/quickstart/text-transformers.mdx
@@ -19,12 +19,14 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id)

-input_ids = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     [{"role": "user", "content": "What is machine learning?"}],
     add_generation_prompt=True,
     return_tensors="pt",
     tokenize=True,
-).to(model.device)
+    return_dict=True,
+)
+input_ids = inputs["input_ids"].to(model.device)

 output = model.generate(input_ids, ${samplingParams}max_new_tokens=512)
 response = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)
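The decode line in the snippet above strips the echoed prompt by slicing: `generate` returns the prompt ids followed by the newly generated ids, so everything past `len(input_ids[0])` is the model's answer. The same slice with plain lists and made-up ids:

```python
# Hypothetical token ids, for illustration only.
input_ids = [[101, 7592, 102]]                  # the templated prompt
output = [[101, 7592, 102, 2023, 2003, 102]]    # prompt ids + new ids

# Same slice as `output[0][len(input_ids[0]):]` in the snippet above.
new_tokens = output[0][len(input_ids[0]):]
print(new_tokens)  # → [2023, 2003, 102]
```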