diff --git a/deployment/gpu-inference/transformers.mdx b/deployment/gpu-inference/transformers.mdx
index 2dd9870..0aeb91d 100644
--- a/deployment/gpu-inference/transformers.mdx
+++ b/deployment/gpu-inference/transformers.mdx
@@ -53,12 +53,14 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 # Generate answer
 prompt = "What is C. elegans?"
-input_ids = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     [{"role": "user", "content": prompt}],
     add_generation_prompt=True,
     return_tensors="pt",
     tokenize=True,
-).to(model.device)
+    return_dict=True,
+)
+input_ids = inputs["input_ids"].to(model.device)
 
 output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)
@@ -149,12 +151,14 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 
 # Use the model and tokenizer setup from Basic Usage above
 prompt = "Tell me a story about space exploration."
-input_ids = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     [{"role": "user", "content": prompt}],
     add_generation_prompt=True,
     return_tensors="pt",
     tokenize=True,
-).to(model.device)
+    return_dict=True,
+)
+input_ids = inputs["input_ids"].to(model.device)
 
 streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 output = model.generate(input_ids, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, streamer=streamer, max_new_tokens=512)
@@ -180,16 +184,18 @@ prompts = [
 ]
 
 # Apply chat templates and tokenize
-batch = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     prompts,
     add_generation_prompt=True,
     return_tensors="pt",
     tokenize=True,
     padding=True,
-).to(model.device)
+    return_dict=True,
+)
+inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
 # Generate for all prompts in batch
-outputs = model.generate(batch, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)
+outputs = model.generate(**inputs, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.05, max_new_tokens=512)
 
 # Decode outputs
 for output in outputs:
diff --git a/scripts/generate_snippets.py b/scripts/generate_snippets.py
index 5729110..e8e1e66 100644
--- a/scripts/generate_snippets.py
+++ b/scripts/generate_snippets.py
@@ -59,12 +59,14 @@
     ")\n"
     "tokenizer = AutoTokenizer.from_pretrained(model_id)\n"
     "\n"
-    "input_ids = tokenizer.apply_chat_template(\n"
+    "inputs = tokenizer.apply_chat_template(\n"
     '    [{"role": "user", "content": "What is machine learning?"}],\n'
     "    add_generation_prompt=True,\n"
     '    return_tensors="pt",\n'
     "    tokenize=True,\n"
-    ").to(model.device)\n"
+    "    return_dict=True,\n"
+    ")\n"
+    'input_ids = inputs["input_ids"].to(model.device)\n'
     "\n"
     "output = model.generate(input_ids, ${samplingParams}max_new_tokens=512)\n"
     "response = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)\n"
diff --git a/snippets/quickstart/text-transformers.mdx b/snippets/quickstart/text-transformers.mdx
index 8affb7b..a47c498 100644
--- a/snippets/quickstart/text-transformers.mdx
+++ b/snippets/quickstart/text-transformers.mdx
@@ -19,12 +19,14 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-input_ids = tokenizer.apply_chat_template(
+inputs = tokenizer.apply_chat_template(
     [{"role": "user", "content": "What is machine learning?"}],
     add_generation_prompt=True,
     return_tensors="pt",
     tokenize=True,
-).to(model.device)
+    return_dict=True,
+)
+input_ids = inputs["input_ids"].to(model.device)
 
 output = model.generate(input_ids, ${samplingParams}max_new_tokens=512)
 response = tokenizer.decode(output[0][len(input_ids[0]):], skip_special_tokens=True)
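
Why `return_dict=True`: by default, `apply_chat_template(..., tokenize=True, return_tensors="pt")` returns a bare tensor of token IDs, so the `attention_mask` is discarded. In the batched path with `padding=True`, `generate` then has no way to tell padding from real tokens. Returning a dict keeps `input_ids` and `attention_mask` together. Below is a minimal standalone sketch of the batched pattern this patch adopts; the model id and prompts are placeholder assumptions, not the ones used in the docs.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder model, an assumption for this sketch
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Decoder-only models should be left-padded for generation; some chat
# tokenizers also ship without a pad token, so fall back to EOS.
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

prompts = [
    [{"role": "user", "content": "What is machine learning?"}],
    [{"role": "user", "content": "What is C. elegans?"}],
]

# return_dict=True returns input_ids plus attention_mask instead of a
# bare tensor, so the padded batch generates correctly.
inputs = tokenizer.apply_chat_template(
    prompts,
    add_generation_prompt=True,
    return_tensors="pt",
    tokenize=True,
    padding=True,
    return_dict=True,
)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

outputs = model.generate(**inputs, max_new_tokens=64)

# Every row of `outputs` starts with the (padded) prompt, so slice it off
# before decoding.
prompt_len = inputs["input_ids"].shape[1]
for output in outputs:
    print(tokenizer.decode(output[prompt_len:], skip_special_tokens=True))
```

The single-prompt hunks only need the IDs themselves, which is why they index `inputs["input_ids"]` rather than forwarding the whole dict; the batched hunk passes everything via `model.generate(**inputs, ...)` so the attention mask actually reaches `generate`.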