From 38aebcee3e0280ae7a572e2d3af0eb15e1036920 Mon Sep 17 00:00:00 2001 From: Kyrian Obikwelu Date: Wed, 4 Feb 2026 11:10:39 +0100 Subject: [PATCH 1/2] feat: use Hugging Face PHP for Hub loading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace custom HubLoader HTTP and cache logic with huggingface-php RepoManager - Use snapshot()+manifest to cache tokenizer.json and tokenizer_config.json efficiently - Remove TOKENIZERS_CACHE handling and custom cache-dir resolution in favor of standard HF cache - Simplify README highlights and cache documentation to match new behavior - Trim old, noisy examples and add a single tokenization_overview.php sample - Update composer requirements to depend on codewithkyrian/huggingface BREAKING CHANGE: - TOKENIZERS_CACHE is no longer read; use cacheDir or HF_HUB_CACHE/HF_HOME instead. - Hub loading behavior and cache layout now follow huggingface-php’s unified cache. --- README.md | 21 +- composer.json | 6 +- examples/context_window_fit_analysis.php | 229 --------- examples/document_chunking_pipeline.php | 247 --------- examples/semantic_search_embeddings.php | 93 ---- .../text_classification_preprocessing.php | 236 --------- examples/tokenization_overview.php | 59 +++ src/Loaders/HubLoader.php | 484 +----------------- 8 files changed, 96 insertions(+), 1279 deletions(-) delete mode 100644 examples/context_window_fit_analysis.php delete mode 100644 examples/document_chunking_pipeline.php delete mode 100644 examples/semantic_search_embeddings.php delete mode 100644 examples/text_classification_preprocessing.php create mode 100644 examples/tokenization_overview.php diff --git a/README.md b/README.md index 7675fbd..84f40c4 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,8 @@ ## Highlights - **Pure PHP** — No FFI, no external binaries, no compiled extensions. Works everywhere PHP runs. -- **Zero Hard Dependencies** — Core tokenization has no required dependencies. Optional HTTP client needed only for Hub downloads. -- **Hub Compatible** — Load tokenizers directly from Hugging Face Hub or from local files. +- **Hub Integration** — Load tokenizers from Hugging Face Hub with smart caching and manifest-based file checks. +- **Flexible Loading** — Load from local files, config arrays, or build custom tokenizers with the builder API. - **Fully Tested** — Validated against BERT, GPT-2, Llama, Gemma, Qwen, RoBERTa, ALBERT, and more. - **Modern PHP** — Built for PHP 8.2+ with strict types, readonly classes, and clean interfaces. @@ -28,16 +28,14 @@ Install via Composer: composer require codewithkyrian/tokenizers ``` -### HTTP Client (Optional) +### HTTP Client (for Hub loading) -If you plan to load tokenizers from the Hugging Face Hub, you'll need an HTTP client implementing PSR-18. We recommend Guzzle: +Loading tokenizers from the Hugging Face Hub requires an HTTP client. We recommend Guzzle: ```bash composer require guzzlehttp/guzzle ``` -> **Note:** The library uses [PHP-HTTP Discovery](https://github.com/php-http/discovery) to automatically find and use any PSR-18 compatible HTTP client installed in your project. If you're only loading tokenizers from local files, no HTTP client is needed. - ## Quick Start ```php @@ -96,10 +94,13 @@ $tokenizer = Tokenizer::fromHub( When `cacheDir` is not specified, the library automatically resolves the cache location: -1. **Environment Variable** — `TOKENIZERS_CACHE` if set -2. **macOS** — `~/Library/Caches/huggingface/tokenizers` -3. **Linux** — `$XDG_CACHE_HOME/huggingface/tokenizers` or `~/.cache/huggingface/tokenizers` -4. **Windows** — `%LOCALAPPDATA%\huggingface\tokenizers` +1. **HF_HUB_CACHE** — if set, used directly +2. **HF_HOME** — if set, `$HF_HOME/hub` +3. **macOS** — `~/Library/Caches/huggingface/hub` +4. **Linux** — `$XDG_CACHE_HOME/huggingface/hub` or `~/.cache/huggingface/hub` +5. **Windows** — `%LOCALAPPDATA%\huggingface\hub` + +Pass `cacheDir` to use a custom directory. ### From Local Files diff --git a/composer.json b/composer.json index 457685d..d2ac0ca 100644 --- a/composer.json +++ b/composer.json @@ -32,9 +32,7 @@ }, "require": { "php": "^8.2", - "psr/http-client": "^1.0", - "psr/http-factory": "^1.0", - "php-http/discovery": "^1.19" + "codewithkyrian/huggingface": "^1.0" }, "require-dev": { "friendsofphp/php-cs-fixer": "^3.91", @@ -56,4 +54,4 @@ "cs:check": "vendor/bin/php-cs-fixer fix --dry-run --diff", "analyse": "vendor/bin/phpstan analyse -c phpstan.dist.neon" } -} \ No newline at end of file +} diff --git a/examples/context_window_fit_analysis.php b/examples/context_window_fit_analysis.php deleted file mode 100644 index e5f6101..0000000 --- a/examples/context_window_fit_analysis.php +++ /dev/null @@ -1,229 +0,0 @@ -modelMaxLength) - * - * NOTE: Chat templates are typically Jinja templates stored in the tokenizer config. - * You can install `codewithkyrian/jinja-php` to parse and render them dynamically: - * - * composer require codewithkyrian/jinja-php - * - * Here we manually implement each model's chat template format. - */ - -require __DIR__.'/../vendor/autoload.php'; - -use Codewithkyrian\Tokenizers\Tokenizer; - -/** - * Chat template implementations for different model families. - * Each model uses a different format for structuring conversations. - */ - -/** - * ChatML format (Qwen, Yi, GPT-4o, etc.) - * Template: <|im_start|>role\ncontent<|im_end|>. - */ -function applyChatMLTemplate(array $messages, bool $addGenerationPrompt = true): string -{ - $formatted = ''; - - foreach ($messages as $message) { - $formatted .= "<|im_start|>{$message['role']}\n{$message['content']}<|im_end|>\n"; - } - - if ($addGenerationPrompt) { - $formatted .= "<|im_start|>assistant\n"; - } - - return $formatted; -} - -/** - * Llama 3 format - * Template: <|start_header_id|>role<|end_header_id|>\n\ncontent<|eot_id|>. - */ -function applyLlama3Template(array $messages, bool $addGenerationPrompt = true): string -{ - $formatted = '<|begin_of_text|>'; - - foreach ($messages as $message) { - $formatted .= "<|start_header_id|>{$message['role']}<|end_header_id|>\n\n"; - $formatted .= "{$message['content']}<|eot_id|>"; - } - - if ($addGenerationPrompt) { - $formatted .= "<|start_header_id|>assistant<|end_header_id|>\n\n"; - } - - return $formatted; -} - -/** - * Claude format (Anthropic) - * Template: \n\nHuman: content\n\nAssistant: content. - */ -function applyClaudeTemplate(array $messages, bool $addGenerationPrompt = true): string -{ - $formatted = ''; - - foreach ($messages as $message) { - $role = match ($message['role']) { - 'system' => 'Human', // Claude handles system via system parameter, but we include it here - 'user' => 'Human', - 'assistant' => 'Assistant', - default => $message['role'], - }; - - // System messages are typically prepended to the first human message - if ('system' === $message['role']) { - $formatted .= "{$message['content']}\n\n"; - - continue; - } - - $formatted .= "\n\n{$role}: {$message['content']}"; - } - - if ($addGenerationPrompt) { - $formatted .= "\n\nAssistant:"; - } - - return trim($formatted); -} - -$models = [ - 'Qwen2-1.5B-Instruct' => [ - 'hub_id' => 'Qwen/Qwen2-1.5B-Instruct', - 'template_fn' => 'applyChatMLTemplate', - ], - 'Llama-3' => [ - 'hub_id' => 'Xenova/llama3-tokenizer', - 'template_fn' => 'applyLlama3Template', - 'context_window' => 8192, - ], - 'GPT-4o' => [ - 'hub_id' => 'Xenova/gpt-4o', - 'template_fn' => 'applyChatMLTemplate', - ], - 'Claude-Sonnet-4' => [ - 'hub_id' => 'Xenova/claude-tokenizer', - 'template_fn' => 'applyClaudeTemplate', - ], - 'Grok-1' => [ - 'hub_id' => 'Xenova/grok-1-tokenizer', - 'template_fn' => 'applyChatMLTemplate', - ], - 'DeepSeek-V3.2' => [ - 'hub_id' => 'deepseek-ai/DeepSeek-V3.2', - 'template_fn' => 'applyChatMLTemplate', - ], -]; - -$conversation = [ - [ - 'role' => 'system', - 'content' => 'You are a helpful customer support assistant for TechCorp. Be concise, professional, and always offer to escalate complex issues to a human agent.', - ], - [ - 'role' => 'user', - 'content' => 'Hi, I purchased a laptop last week and the screen keeps flickering. I\'ve tried restarting it multiple times but the issue persists.', - ], - [ - 'role' => 'assistant', - 'content' => 'I\'m sorry to hear about the screen flickering issue with your new laptop. This could be caused by a few things - a driver issue, loose display cable, or a hardware defect. Let me help you troubleshoot. First, could you tell me the laptop model and whether you\'ve updated the graphics drivers recently?', - ], - [ - 'role' => 'user', - 'content' => 'It\'s the TechCorp ProBook 15. I haven\'t updated any drivers since I got it. The flickering happens randomly, sometimes every few minutes, sometimes it doesn\'t happen for hours.', - ], - [ - 'role' => 'assistant', - 'content' => 'Thank you for those details. The intermittent nature suggests it might be a driver or software issue rather than hardware. Let\'s try updating your graphics drivers first. Go to Settings > Windows Update > Check for updates, and also check for optional driver updates. If that doesn\'t resolve it within 24 hours, we can arrange a diagnostic or replacement under your warranty.', - ], - [ - 'role' => 'user', - 'content' => 'Okay, I\'ll try that. If it doesn\'t work, how do I arrange the diagnostic? And will I lose my data?', - ], -]; - -echo "=== Context Window Fit Analysis ===\n\n"; -echo "Analyzing the same conversation across different LLM tokenizers.\n"; -echo 'Messages: '.count($conversation)."\n\n"; - -$results = []; - -foreach ($models as $modelName => $config) { - echo "Loading {$modelName}...\n"; - - $tokenizer = Tokenizer::fromHub($config['hub_id']); - $templateFn = $config['template_fn']; - $formattedPrompt = $templateFn($conversation); - $encoding = $tokenizer->encode($formattedPrompt); - $tokenCount = count($encoding->ids); - $contextWindow = $config['context_window'] ?? $tokenizer->modelMaxLength; - - $results[$modelName] = [ - 'tokens' => $tokenCount, - 'context_window' => $contextWindow, - 'percent_used' => $contextWindow ? round(($tokenCount / $contextWindow) * 100, 2) : null, - 'remaining' => $contextWindow ? $contextWindow - $tokenCount : null, - 'fits' => $contextWindow ? $tokenCount <= $contextWindow : null, - ]; -} - -echo "\n".str_repeat('=', 85)."\n"; -echo "RESULTS\n"; -echo str_repeat('=', 85)."\n\n"; - -// Header -printf( - "%-20s │ %10s │ %15s │ %8s │ %15s │ %s\n", - 'Model', - 'Tokens', - 'Context Window', - 'Used %', - 'Remaining', - 'Status' -); -echo str_repeat('─', 85)."\n"; - -foreach ($results as $modelName => $data) { - $contextStr = null !== $data['context_window'] - ? number_format($data['context_window']) - : 'N/A'; - - $percentStr = null !== $data['percent_used'] - ? $data['percent_used'].'%' - : 'N/A'; - - $remainingStr = null !== $data['remaining'] - ? number_format($data['remaining']) - : 'N/A'; - - $status = match (true) { - null === $data['fits'] => '? Unknown', - $data['fits'] => '✓ Fits', - default => '✗ Exceeds', - }; - - printf( - "%-20s │ %10s │ %15s │ %8s │ %15s │ %s\n", - $modelName, - number_format($data['tokens']), - $contextStr, - $percentStr, - $remainingStr, - $status - ); -} - -echo str_repeat('─', 85)."\n\n"; diff --git a/examples/document_chunking_pipeline.php b/examples/document_chunking_pipeline.php deleted file mode 100644 index 9abde6a..0000000 --- a/examples/document_chunking_pipeline.php +++ /dev/null @@ -1,247 +0,0 @@ - [ - 'max_tokens' => 256, - 'overlap_tokens' => 50, - 'description' => 'For sentence-transformer embeddings', - ], - 'summarization_model' => [ - 'max_tokens' => 512, - 'overlap_tokens' => 100, - 'description' => 'For T5/BART summarization', - ], - 'llm_context' => [ - 'max_tokens' => 1024, - 'overlap_tokens' => 200, - 'description' => 'For LLM context windows', - ], -]; - -echo "=== Document Chunking Pipeline Example ===\n\n"; - -// Load tokenizer -$tokenizer = Tokenizer::fromHub('bert-base-uncased'); - -echo "Tokenizer loaded: bert-base-uncased\n\n"; - -// Analyze the full document first -$fullEncoding = $tokenizer->encode($longDocument); -$totalTokens = count($fullEncoding->ids); - -echo "--- Document Analysis ---\n\n"; -echo 'Total characters: '.mb_strlen($longDocument)."\n"; -echo "Total tokens: {$totalTokens}\n"; -echo 'Average tokens per character: '.round($totalTokens / mb_strlen($longDocument), 3)."\n\n"; - -/** - * Split text into sentences (simple implementation). - */ -function splitIntoSentences(string $text): array -{ - // Split on sentence-ending punctuation followed by space or end - $sentences = preg_split('/(?<=[.!?])\s+/', $text, -1, \PREG_SPLIT_NO_EMPTY); - - return array_map('trim', $sentences); -} - -/** - * Create token-aware chunks from a document. - * - * @param int $maxTokens Maximum tokens per chunk (excluding special tokens) - * @param int $overlapTokens Number of tokens to overlap between chunks - * - * @return array{chunks: array, metadata: array} - */ -function createChunks(Tokenizer $tokenizer, string $text, int $maxTokens, int $overlapTokens): array -{ - $sentences = splitIntoSentences($text); - $chunks = []; - $metadata = []; - - $currentChunk = []; - $currentTokenCount = 0; - $chunkStartIndex = 0; - - foreach ($sentences as $sentenceIndex => $sentence) { - $sentenceEncoding = $tokenizer->encode($sentence, addSpecialTokens: false); - $sentenceTokens = count($sentenceEncoding->ids); - - // If single sentence exceeds max, we need to split it (edge case) - if ($sentenceTokens > $maxTokens) { - // Flush current chunk first - if (!empty($currentChunk)) { - $chunkText = implode(' ', $currentChunk); - $chunks[] = $chunkText; - $metadata[] = [ - 'chunk_index' => count($chunks) - 1, - 'sentence_range' => [$chunkStartIndex, $sentenceIndex - 1], - 'token_count' => $currentTokenCount, - ]; - } - - // Add the long sentence as its own chunk (will be truncated by model) - $chunks[] = $sentence; - $metadata[] = [ - 'chunk_index' => count($chunks) - 1, - 'sentence_range' => [$sentenceIndex, $sentenceIndex], - 'token_count' => $sentenceTokens, - 'warning' => 'Sentence exceeds max tokens, may be truncated', - ]; - - $currentChunk = []; - $currentTokenCount = 0; - $chunkStartIndex = $sentenceIndex + 1; - - continue; - } - - // Check if adding this sentence would exceed the limit - if ($currentTokenCount + $sentenceTokens > $maxTokens && !empty($currentChunk)) { - // Save current chunk - $chunkText = implode(' ', $currentChunk); - $chunks[] = $chunkText; - $metadata[] = [ - 'chunk_index' => count($chunks) - 1, - 'sentence_range' => [$chunkStartIndex, $sentenceIndex - 1], - 'token_count' => $currentTokenCount, - ]; - - // Calculate overlap: include last N tokens worth of sentences - $overlapSentences = []; - $overlapCount = 0; - for ($i = count($currentChunk) - 1; $i >= 0 && $overlapCount < $overlapTokens; --$i) { - $sentEnc = $tokenizer->encode($currentChunk[$i], addSpecialTokens: false); - $overlapCount += count($sentEnc->ids); - array_unshift($overlapSentences, $currentChunk[$i]); - } - - $currentChunk = $overlapSentences; - $currentTokenCount = $overlapCount; - $chunkStartIndex = $sentenceIndex - count($overlapSentences); - } - - $currentChunk[] = $sentence; - $currentTokenCount += $sentenceTokens; - } - - // Don't forget the last chunk - if (!empty($currentChunk)) { - $chunkText = implode(' ', $currentChunk); - $chunks[] = $chunkText; - $metadata[] = [ - 'chunk_index' => count($chunks) - 1, - 'sentence_range' => [$chunkStartIndex, count($sentences) - 1], - 'token_count' => $currentTokenCount, - ]; - } - - return ['chunks' => $chunks, 'metadata' => $metadata]; -} - -// Process document with different configurations -foreach ($chunkConfigs as $configName => $config) { - echo "--- Chunking for: {$config['description']} ---\n"; - echo "Max tokens: {$config['max_tokens']}, Overlap: {$config['overlap_tokens']}\n\n"; - - $result = createChunks( - $tokenizer, - $longDocument, - $config['max_tokens'], - $config['overlap_tokens'] - ); - - echo 'Created '.count($result['chunks'])." chunks:\n\n"; - - foreach ($result['chunks'] as $index => $chunk) { - $meta = $result['metadata'][$index]; - $preview = mb_substr($chunk, 0, 80); - - echo 'Chunk '.($index + 1).":\n"; - echo " Tokens: {$meta['token_count']}\n"; - echo " Sentences: {$meta['sentence_range'][0]}-{$meta['sentence_range'][1]}\n"; - echo " Preview: \"{$preview}...\"\n"; - - if (isset($meta['warning'])) { - echo " ⚠️ {$meta['warning']}\n"; - } - echo "\n"; - } - - echo str_repeat('-', 60)."\n\n"; -} - -// Demonstrate verification: all text should be recoverable from chunks (minus overlaps) -echo "--- Chunk Verification ---\n\n"; - -$config = $chunkConfigs['embedding_model']; -$result = createChunks($tokenizer, $longDocument, $config['max_tokens'], $config['overlap_tokens']); - -// Verify each chunk encodes correctly -$allValid = true; -foreach ($result['chunks'] as $index => $chunk) { - $encoding = $tokenizer->encode($chunk); - $decoded = $tokenizer->decode($encoding->ids, skipSpecialTokens: true); - - // Simple check: decoded text should contain key words from original - $originalWords = array_slice(explode(' ', $chunk), 0, 5); - $decodedWords = explode(' ', $decoded); - - $matchCount = count(array_filter($originalWords, static fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords)))); - - if ($matchCount < 3) { - echo '⚠️ Chunk '.($index + 1)." may have encoding issues\n"; - $allValid = false; - } -} - -if ($allValid) { - echo "✓ All chunks verified - encode/decode roundtrip successful\n"; -} - -echo "\nPipeline complete! Chunks are ready for embedding or processing.\n"; diff --git a/examples/semantic_search_embeddings.php b/examples/semantic_search_embeddings.php deleted file mode 100644 index 220f9a9..0000000 --- a/examples/semantic_search_embeddings.php +++ /dev/null @@ -1,93 +0,0 @@ - 'doc_001', - 'title' => 'Introduction to Machine Learning', - 'content' => 'Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It focuses on developing algorithms that can access data and use it to learn for themselves.', - ], - [ - 'id' => 'doc_002', - 'title' => 'Neural Networks Explained', - 'content' => 'Neural networks are computing systems inspired by biological neural networks in the human brain. They consist of interconnected nodes organized in layers that process information using connectionist approaches to computation.', - ], - [ - 'id' => 'doc_003', - 'title' => 'Natural Language Processing', - 'content' => 'NLP combines computational linguistics with statistical, machine learning, and deep learning models. It enables computers to process and analyze large amounts of natural language data, from text classification to machine translation.', - ], - [ - 'id' => 'doc_004', - 'title' => 'Computer Vision Applications', - 'content' => 'Computer vision trains machines to interpret and understand visual information from the world. Applications range from facial recognition and autonomous vehicles to medical image analysis and industrial quality control.', - ], -]; - -$searchQueries = [ - 'How do computers learn from data?', - 'What are the layers in AI systems?', - 'Processing human language with AI', -]; - -echo "=== Semantic Search Tokenization Example ===\n\n"; - -$tokenizer = Tokenizer::fromHub('sentence-transformers/all-MiniLM-L6-v2'); - -echo "Tokenizer loaded: sentence-transformers/all-MiniLM-L6-v2\n\n"; - -echo "--- Processing Documents ---\n\n"; - -$maxTokenLength = 256; - -foreach ($documents as $doc) { - $text = $doc['title'].'. '.$doc['content']; - $encoding = $tokenizer->encode($text); - - $tokenCount = count($encoding->ids); - $truncated = $tokenCount > $maxTokenLength; - - echo "Document: {$doc['id']}\n"; - echo " Title: {$doc['title']}\n"; - echo " Token count: {$tokenCount}".($truncated ? ' (would need truncation)' : '')."\n"; - echo ' First 10 tokens: '.implode(', ', array_slice($encoding->tokens, 0, 10))."...\n"; - echo ' First 10 IDs: ['.implode(', ', array_slice($encoding->ids, 0, 10))."...]\n\n"; -} - -echo "--- Processing Search Queries ---\n\n"; - -foreach ($searchQueries as $index => $query) { - $encoding = $tokenizer->encode($query); - - echo 'Query '.($index + 1).": \"{$query}\"\n"; - echo ' Tokens: '.implode(', ', $encoding->tokens)."\n"; - echo ' IDs: ['.implode(', ', $encoding->ids)."]\n"; - echo ' Token count: '.count($encoding->ids)."\n\n"; -} - -echo "--- Round-trip Verification ---\n\n"; - -$testText = 'Machine learning enables pattern recognition in data.'; -$encoding = $tokenizer->encode($testText); -$decoded = $tokenizer->decode($encoding->ids); - -echo "Original: {$testText}\n"; -echo 'Encoded: ['.implode(', ', $encoding->ids)."]\n"; -echo "Decoded: {$decoded}\n"; diff --git a/examples/text_classification_preprocessing.php b/examples/text_classification_preprocessing.php deleted file mode 100644 index a75c423..0000000 --- a/examples/text_classification_preprocessing.php +++ /dev/null @@ -1,236 +0,0 @@ - 'Absolutely love this product! The quality exceeded my expectations and shipping was incredibly fast. Will definitely buy again.', - 'label' => 'positive', - ], - [ - 'text' => 'Decent product for the price. Does what it\'s supposed to do, nothing more nothing less. Packaging could be better.', - 'label' => 'neutral', - ], - [ - 'text' => 'Complete waste of money. Arrived broken, customer service was unhelpful, and the return process took forever. Avoid!', - 'label' => 'negative', - ], - [ - 'text' => 'The device worked fine for about two weeks, then suddenly stopped charging. Replacement had the same issue. Very frustrating experience.', - 'label' => 'negative', - ], -]; - -// Sample data for Natural Language Inference (premise + hypothesis pairs) -$nliSamples = [ - [ - 'premise' => 'A man is playing a guitar on stage in front of a large crowd.', - 'hypothesis' => 'A musician is performing at a concert.', - 'label' => 'entailment', - ], - [ - 'premise' => 'A man is playing a guitar on stage in front of a large crowd.', - 'hypothesis' => 'A man is sleeping in his bedroom.', - 'label' => 'contradiction', - ], - [ - 'premise' => 'A man is playing a guitar on stage in front of a large crowd.', - 'hypothesis' => 'The man is a professional musician.', - 'label' => 'neutral', - ], -]; - -// Sample data for question answering context matching -$qaSamples = [ - [ - 'question' => 'What is the capital of France?', - 'context' => 'France is a country in Western Europe. Its capital city is Paris, which is known for the Eiffel Tower and the Louvre Museum.', - ], - [ - 'question' => 'When was the company founded?', - 'context' => 'TechCorp was founded in 2010 by Jane Smith and John Doe. The company started in a small garage in Silicon Valley and has since grown to over 5000 employees worldwide.', - ], -]; - -echo "=== Text Classification Preprocessing Example ===\n\n"; - -// Load BERT tokenizer - the standard for many classification tasks -$tokenizer = Tokenizer::fromHub('bert-base-uncased'); - -echo "Tokenizer loaded: bert-base-uncased\n"; -echo "Max sequence length for BERT: 512 tokens\n\n"; - -// ============================================ -// SINGLE SEQUENCE: Sentiment Analysis -// ============================================ - -echo "--- Sentiment Analysis (Single Sequence) ---\n\n"; - -$maxLength = 128; // Typical for classification tasks - -foreach ($sentimentSamples as $index => $sample) { - $encoding = $tokenizer->encode($sample['text']); - - $tokenCount = count($encoding->ids); - $needsPadding = $tokenCount < $maxLength; - $needsTruncation = $tokenCount > $maxLength; - - echo 'Sample '.($index + 1)." [{$sample['label']}]:\n"; - echo ' Text: "'.mb_substr($sample['text'], 0, 60)."...\"\n"; - echo " Token count: {$tokenCount}\n"; - - // Show BERT's special token structure - echo " Structure: [CLS] ... text tokens ... [SEP]\n"; - echo ' First 5: '.implode(' ', array_slice($encoding->tokens, 0, 5))."\n"; - echo ' Last 3: '.implode(' ', array_slice($encoding->tokens, -3))."\n"; - - if ($needsPadding) { - $paddingNeeded = $maxLength - $tokenCount; - echo " Padding needed: {$paddingNeeded} [PAD] tokens\n"; - } - if ($needsTruncation) { - $truncateCount = $tokenCount - $maxLength; - echo " Truncation needed: remove {$truncateCount} tokens\n"; - } - - echo "\n"; -} - -// ============================================ -// SENTENCE PAIRS: Natural Language Inference -// ============================================ - -echo "--- Natural Language Inference (Sentence Pairs) ---\n\n"; - -foreach ($nliSamples as $index => $sample) { - // BERT uses textPair for sentence pair tasks - $encoding = $tokenizer->encode( - text: $sample['premise'], - textPair: $sample['hypothesis'], - addSpecialTokens: true - ); - - echo 'Sample '.($index + 1)." [{$sample['label']}]:\n"; - echo " Premise: \"{$sample['premise']}\"\n"; - echo " Hypothesis: \"{$sample['hypothesis']}\"\n"; - echo ' Token count: '.count($encoding->ids)."\n"; - - // Show the structure with type IDs - echo " Structure: [CLS] premise [SEP] hypothesis [SEP]\n"; - - // Type IDs distinguish between premise (0) and hypothesis (1) - $segment0Count = count(array_filter($encoding->typeIds, static fn ($t) => 0 === $t)); - $segment1Count = count(array_filter($encoding->typeIds, static fn ($t) => 1 === $t)); - - echo " Segment A (premise) tokens: {$segment0Count}\n"; - echo " Segment B (hypothesis) tokens: {$segment1Count}\n"; - echo ' Type IDs sample: ['.implode(', ', array_slice($encoding->typeIds, 0, 10))."...]\n\n"; -} - -// ============================================ -// QUESTION ANSWERING PAIRS -// ============================================ - -echo "--- Question Answering (Question + Context) ---\n\n"; - -foreach ($qaSamples as $index => $sample) { - $encoding = $tokenizer->encode( - text: $sample['question'], - textPair: $sample['context'], - addSpecialTokens: true - ); - - echo 'Sample '.($index + 1).":\n"; - echo " Question: \"{$sample['question']}\"\n"; - echo ' Context: "'.mb_substr($sample['context'], 0, 80)."...\"\n"; - echo ' Total tokens: '.count($encoding->ids)."\n"; - - // Find where the context starts (after second segment begins) - $contextStart = array_search(1, $encoding->typeIds); - echo " Question tokens: {$contextStart}\n"; - echo ' Context tokens: '.(count($encoding->ids) - $contextStart)."\n\n"; -} - -// ============================================ -// BATCH PROCESSING HELPER -// ============================================ - -echo "--- Batch Preprocessing Helper ---\n\n"; - -/** - * Preprocess a batch of texts for classification. - * In production, you'd send these to your model. - * - * @return array{input_ids: array, attention_mask: array, token_type_ids: array} - */ -function preprocessBatch(Tokenizer $tokenizer, array $texts, int $maxLength = 128): array -{ - $batchInputIds = []; - $batchAttentionMask = []; - $batchTokenTypeIds = []; - - foreach ($texts as $text) { - $encoding = $tokenizer->encode($text); - - $ids = $encoding->ids; - $typeIds = $encoding->typeIds; - - // Truncate if needed - if (count($ids) > $maxLength) { - $ids = array_slice($ids, 0, $maxLength - 1); - $ids[] = 102; // [SEP] token ID for BERT - $typeIds = array_slice($typeIds, 0, $maxLength); - } - - // Pad if needed - $paddingLength = $maxLength - count($ids); - $attentionMask = array_merge( - array_fill(0, count($ids), 1), - array_fill(0, $paddingLength, 0) - ); - - $ids = array_merge($ids, array_fill(0, $paddingLength, 0)); // [PAD] = 0 - $typeIds = array_merge($typeIds, array_fill(0, $paddingLength, 0)); - - $batchInputIds[] = $ids; - $batchAttentionMask[] = $attentionMask; - $batchTokenTypeIds[] = $typeIds; - } - - return [ - 'input_ids' => $batchInputIds, - 'attention_mask' => $batchAttentionMask, - 'token_type_ids' => $batchTokenTypeIds, - ]; -} - -// Process the sentiment samples as a batch -$texts = array_column($sentimentSamples, 'text'); -$batch = preprocessBatch($tokenizer, $texts, 64); - -echo 'Batch processed: '.count($texts)." samples\n"; -echo "Each padded/truncated to: 64 tokens\n"; -echo "Output shapes:\n"; -echo ' input_ids: ['.count($batch['input_ids']).', '.count($batch['input_ids'][0])."]\n"; -echo ' attention_mask: ['.count($batch['attention_mask']).', '.count($batch['attention_mask'][0])."]\n"; -echo ' token_type_ids: ['.count($batch['token_type_ids']).', '.count($batch['token_type_ids'][0])."]\n"; -echo "\nReady for model input!\n"; diff --git a/examples/tokenization_overview.php b/examples/tokenization_overview.php new file mode 100644 index 0000000..51fa527 --- /dev/null +++ b/examples/tokenization_overview.php @@ -0,0 +1,59 @@ + 'google-bert/bert-base-uncased', + 'GPT-2' => 'openai-community/gpt2', + 'Qwen3 Embedding' => 'Qwen/Qwen3-Embedding-0.6B', +]; + +$samples = [ + 'Short sentence' => 'Hello, how are you doing today?', + 'Code snippet' => 'function sum(int $a, int $b): int { return $a + $b; }', + 'Mixed content' => 'Paris is the capital of France. 42 🧠', +]; + +echo "=== Tokenizers PHP - Tokenization Overview ===\n\n"; + +foreach ($models as $label => $modelId) { + echo "Model: {$label}\n"; + echo "Hub ID: {$modelId}\n"; + + $tokenizer = Tokenizer::fromHub($modelId); + + foreach ($samples as $sampleLabel => $text) { + $encoding = $tokenizer->encode($text); + + $ids = $encoding->ids; + $tokens = $encoding->tokens; + + $count = \count($ids); + $idsPreview = implode(', ', array_slice($ids, 0, 10)); + $tokensPreview = implode(' ', array_slice($tokens, 0, 10)); + + echo "- {$sampleLabel}:\n"; + echo " Text: {$text}\n"; + echo " Token count: {$count}\n"; + echo " IDs (first 10): {$idsPreview}".($count > 10 ? ' ...' : '')."\n"; + echo " Tokens (first 10): {$tokensPreview}".($count > 10 ? ' ...' : '')."\n\n"; + } + + echo str_repeat('-', 60)."\n\n"; +} + diff --git a/src/Loaders/HubLoader.php b/src/Loaders/HubLoader.php index e6f189c..4529fe4 100644 --- a/src/Loaders/HubLoader.php +++ b/src/Loaders/HubLoader.php @@ -4,494 +4,58 @@ namespace Codewithkyrian\Tokenizers\Loaders; +use Codewithkyrian\HuggingFace\HuggingFace; use Codewithkyrian\Tokenizers\Contracts\ConfigLoaderInterface; -use Http\Discovery\Psr17FactoryDiscovery; -use Http\Discovery\Psr18ClientDiscovery; -use Psr\Http\Client\ClientInterface; -use Psr\Http\Message\RequestFactoryInterface; -use Psr\Http\Message\RequestInterface; -use Psr\Http\Message\ResponseInterface; -use Psr\Http\Message\UriFactoryInterface; +/** + * Loads tokenizer configuration from the Hugging Face Hub. + */ class HubLoader implements ConfigLoaderInterface { - protected const HF_ENDPOINT = 'https://huggingface.co'; - protected const TOKENIZERS_VERSION = '0.1.0'; - - protected ClientInterface $client; - protected RequestFactoryInterface $requestFactory; - protected UriFactoryInterface $uriFactory; - - protected ?string $resolvedCacheDir = null; + private const TOKENIZER_FILES = ['tokenizer.json', 'tokenizer_config.json']; public function __construct( protected ?string $cacheDir = null, - protected ?string $revision = 'main', + protected string $revision = 'main', protected ?string $token = null - ) { - $this->client = Psr18ClientDiscovery::find(); - $this->requestFactory = Psr17FactoryDiscovery::findRequestFactory(); - $this->uriFactory = Psr17FactoryDiscovery::findUriFactory(); - $this->resolvedCacheDir = $this->resolveCacheDir(); - } + ) {} public function load(string ...$source): array { if (0 === \count($source)) { - throw new \Exception('A model ID must be provided.'); + throw new \InvalidArgumentException('A model ID must be provided.'); } $modelId = $source[0]; - $encodedSource = implode('/', array_map('rawurlencode', explode('/', $modelId))); - $encodedRevision = rawurlencode($this->revision); - - $tokenizerUrl = \sprintf( - '%s/%s/resolve/%s/tokenizer.json', - self::HF_ENDPOINT, - $encodedSource, - $encodedRevision - ); - - $tokenizerConfigUrl = \sprintf( - '%s/%s/resolve/%s/tokenizer_config.json', - self::HF_ENDPOINT, - $encodedSource, - $encodedRevision - ); - - $bundle = $this->loadFromBundleCache($tokenizerUrl, $tokenizerConfigUrl); - if (null !== $bundle) { - return $bundle; - } - - [$tokenizerJson, $tokenizerPath, $tokenizerEtag] = $this->downloadJson($tokenizerUrl, 'tokenizer.json', $modelId); - [$tokenizerConfig, $tokenizerConfigPath, $tokenizerConfigEtag] = $this->downloadJson($tokenizerConfigUrl, 'tokenizer_config.json', $modelId, optional: true); + $factory = HuggingFace::factory(); - if (null !== $this->resolvedCacheDir) { - $this->cacheBundle( - $tokenizerUrl, - $tokenizerPath, - $tokenizerEtag, - $tokenizerConfigUrl, - $tokenizerConfigPath, - $tokenizerConfigEtag - ); + if (null !== $this->token) { + $factory = $factory->withToken($this->token); } - return $this->mergeConfigs($tokenizerJson, $tokenizerConfig); - } - - /** - * Sends a request and follows HTTP redirects until a non-redirect response is received. - * - * @param RequestInterface $request the initial request - * @param int $maxRedirects maximum number of redirects to follow (default: 5) - * - * @return ResponseInterface the final response after following redirects - * - * @throws \Exception if too many redirects occur or if a redirect fails - */ - protected function sendRequest(RequestInterface $request, int $maxRedirects = 5): ResponseInterface - { - $redirectCount = 0; - $currentRequest = $request; - - while ($redirectCount < $maxRedirects) { - $response = $this->client->sendRequest($currentRequest); - $statusCode = $response->getStatusCode(); - - // Check if this is a redirect (3xx status code) - if ($statusCode >= 300 && $statusCode < 400) { - $location = $response->getHeaderLine('Location'); - if (empty($location)) { - throw new \Exception('Received redirect response without Location header'); - } - - if (preg_match('/^https?:\/\//', $location)) { - $parsed = parse_url($location); - if (false !== $parsed) { - $uri = $this->uriFactory->createUri() - ->withScheme($parsed['scheme'] ?? 'https') - ->withHost($parsed['host'] ?? '') - ->withPort($parsed['port'] ?? null) - ->withPath($parsed['path'] ?? '/') - ->withQuery($parsed['query'] ?? '') - ->withFragment($parsed['fragment'] ?? '') - ; - - $currentRequest = $this->requestFactory->createRequest('GET', $uri); - } else { - $currentRequest = $this->requestFactory->createRequest('GET', $location); - } - } else { - $parsed = parse_url($location); - - $newUri = $currentRequest->getUri() - ->withQuery($parsed['query'] ?? '') - ->withFragment($parsed['fragment'] ?? '') - ; - - $locationPath = $parsed['path'] ?? $location; - - if (str_starts_with($location, '/')) { - $newUri = $newUri->withPath($locationPath); - } else { - $basePath = $newUri->getPath(); - $basePath = '.' === \dirname($basePath) ? '/' : \dirname($basePath); - $newUri = $newUri->withPath(rtrim($basePath, '/').'/'.$locationPath); - } - - $currentRequest = $this->requestFactory->createRequest('GET', $newUri); - } - - $currentRequest = $currentRequest - ->withHeader('User-Agent', 'tokenizers/'.self::TOKENIZERS_VERSION.'; PHP') - ; - - if ($this->token) { - $currentRequest = $currentRequest->withHeader('Authorization', "Bearer {$this->token}"); - } - - ++$redirectCount; - - continue; - } - - return $response; - } - - throw new \Exception("Too many redirects (max: {$maxRedirects})"); - } - - /** - * Resolves the cache directory based on OS. - * - * @return null|string the cache directory path, or null if it cannot be determined - */ - protected function resolveCacheDir(): ?string - { if (null !== $this->cacheDir) { - return $this->ensureCacheDir($this->cacheDir); + $factory = $factory->withCacheDir($this->cacheDir); } - $envCache = getenv('TOKENIZERS_CACHE'); - if (false !== $envCache) { - return $this->ensureCacheDir($envCache); - } - - $baseDir = $this->getOSCacheBaseDir(); - if (null === $baseDir) { - return null; - } - - $cacheDir = $baseDir.\DIRECTORY_SEPARATOR.'huggingface'.\DIRECTORY_SEPARATOR.'tokenizers'; - - return $this->ensureCacheDir($cacheDir); - } - - /** - * Gets the OS-specific base cache directory. - * - * @return null|string the base cache directory, or null if it cannot be determined - */ - protected function getOSCacheBaseDir(): ?string - { - if (\PHP_OS_FAMILY === 'Windows') { - $localAppData = getenv('LOCALAPPDATA'); + $hf = $factory->make(); - return false !== $localAppData ? $localAppData : null; - } - - if (\PHP_OS_FAMILY === 'Darwin') { - $home = getenv('HOME'); - - return false !== $home ? $home.\DIRECTORY_SEPARATOR.'Library'.\DIRECTORY_SEPARATOR.'Caches' : null; - } + $repo = $hf->hub() + ->repo($modelId) + ->revision($this->revision); - $xdgCache = getenv('XDG_CACHE_HOME'); - if (false !== $xdgCache) { - return $xdgCache; - } - - $home = getenv('HOME'); - - return false !== $home ? $home.\DIRECTORY_SEPARATOR.'.cache' : null; - } - - /** - * Ensures the cache directory exists and returns the path. - * - * @param string $dir the directory path - * - * @return null|string the directory path, or null if it cannot be created - */ - protected function ensureCacheDir(string $dir): ?string - { - if (!is_dir($dir)) { - if (!@mkdir($dir, 0755, true) && !is_dir($dir)) { - return null; - } - } - - return $dir; - } - - /** - * Gets the cached path for a URL if it exists and is valid. - * - * @param string $url the URL to check - * - * @return null|string the cached file path, or null if not cached or invalid - */ - protected function getCachedPath(string $url): ?string - { - $fsum = hash('sha256', $url); - $metaPattern = $this->resolvedCacheDir.\DIRECTORY_SEPARATOR.$fsum.'.*.meta'; - $metaFiles = glob($metaPattern); - - if (empty($metaFiles)) { - return null; - } - - $latestMeta = null; - $latestTime = 0; - foreach ($metaFiles as $metaFile) { - $content = file_get_contents($metaFile); - if (false === $content) { - continue; - } - - $meta = json_decode($content, true); - if (null === $meta) { - continue; - } - - $creationTime = $meta['creation_time'] ?? 0; - if ($creationTime > $latestTime) { - $latestTime = $creationTime; - $latestMeta = $meta; - } - } - - if (null === $latestMeta || !isset($latestMeta['resource_path'])) { - return null; - } - - $resourcePath = $latestMeta['resource_path']; - if (!file_exists($resourcePath)) { - return null; - } - - // Trust the cache if file exists and metadata is valid - // The etag-based filename (fsum.esum) provides uniqueness - return $resourcePath; - } - - /** - * Attempt to load both tokenizer.json and tokenizer_config.json from a bundled cache entry. - * - * @return null|array - */ - protected function loadFromBundleCache(string $tokenizerUrl, string $tokenizerConfigUrl): ?array - { - if (null === $this->resolvedCacheDir) { - return null; - } - - $bundleKey = hash('sha256', $tokenizerUrl.'|'.$tokenizerConfigUrl); - $metaPath = $this->resolvedCacheDir.\DIRECTORY_SEPARATOR.$bundleKey.'.bundle.meta'; - - if (!file_exists($metaPath)) { - return null; - } - - $metaContent = file_get_contents($metaPath); - if (false === $metaContent) { - return null; - } - - $meta = json_decode($metaContent, true); - if (!\is_array($meta)) { - return null; - } - - $tokPath = $meta['tokenizer_path'] ?? null; - $tokCfgPath = $meta['tokenizer_config_path'] ?? null; - - if (!$tokPath || !file_exists($tokPath)) { - return null; - } - - $tokenizerContent = file_get_contents($tokPath); - if (false === $tokenizerContent) { - return null; - } - - $tokenizerJson = json_decode($tokenizerContent, true); - if (\JSON_ERROR_NONE !== json_last_error()) { - return null; - } - - $tokenizerConfig = null; - if ($tokCfgPath && file_exists($tokCfgPath)) { - $tokenizerConfigContent = file_get_contents($tokCfgPath); - if (false === $tokenizerConfigContent) { - return null; - } - - $tokenizerConfig = json_decode($tokenizerConfigContent, true); - if (\JSON_ERROR_NONE !== json_last_error()) { - $tokenizerConfig = null; - } - } - - return $this->mergeConfigs($tokenizerJson, $tokenizerConfig); - } - - /** - * Download a JSON resource, using cache when available. - * - * @param bool $optional treat 404 as optional - * - * @return array{0: null|array, 1: null|string, 2: null|string} [json, path, etag] - * - * @throws \Exception - */ - protected function downloadJson(string $url, string $label, string $source, bool $optional = false): array - { - // Try cache for this resource - $cachedPath = $this->resolvedCacheDir ? $this->getCachedPath($url) : null; - if (null !== $cachedPath && file_exists($cachedPath)) { - $cachedContent = file_get_contents($cachedPath); - if (false === $cachedContent) { - return [null, null, null]; - } - - $json = json_decode($cachedContent, true); - if (\JSON_ERROR_NONE === json_last_error()) { - return [$json, $cachedPath, null]; - } - } - - $request = $this->requestFactory->createRequest('GET', $url) - ->withHeader('User-Agent', 'tokenizers/'.self::TOKENIZERS_VERSION.'; PHP') - ; - - if ($this->token) { - $request = $request->withHeader('Authorization', "Bearer {$this->token}"); - } - - try { - $response = $this->sendRequest($request); - } catch (\Exception $e) { - throw new \Exception("Failed to load {$label} from Hub for model {$source}: ".$e->getMessage(), 0, $e); - } - - $status = $response->getStatusCode(); - if ($optional && 404 === $status) { - return [null, null, null]; - } - - $content = (string) $response->getBody(); - if (200 !== $status) { - throw new \Exception("Failed to load {$label} from Hub for model {$source}: ".$content); - } - - $json = json_decode($content, true); - if (\JSON_ERROR_NONE !== json_last_error()) { - throw new \Exception("Invalid JSON in {$label} from {$source}: ".json_last_error_msg()); - } - - $etag = $response->getHeaderLine('ETag') ?: null; - $path = null; - - if (null !== $this->resolvedCacheDir && null !== $etag) { - $path = $this->cacheResponse($url, $content, $etag); - } - - return [$json, $path, $etag]; - } - - /** - * Cache a response with a provided ETag, returning the resource path. - */ - protected function cacheResponse(string $url, string $content, string $etag): ?string - { - $fsum = hash('sha256', $url); - $esum = hash('sha256', $etag); - $resourcePath = $this->resolvedCacheDir.\DIRECTORY_SEPARATOR.$fsum.'.'.$esum; - $metaPath = $resourcePath.'.meta'; - $lockPath = $resourcePath.'.lock'; - - file_put_contents($lockPath, ''); - - try { - file_put_contents($resourcePath, $content); + $repo->snapshot( + allowPatterns: self::TOKENIZER_FILES, + force: false + ); - $meta = [ - 'resource' => $url, - 'resource_path' => $resourcePath, - 'meta_path' => $metaPath, - 'etag' => $etag, - 'expires' => null, - 'creation_time' => microtime(true), - ]; + $tokenizer = $repo->download('tokenizer.json')->json(); - file_put_contents($metaPath, json_encode($meta, \JSON_PRETTY_PRINT)); - } finally { - if (file_exists($lockPath)) { - unlink($lockPath); - } + $tokenizerConfig = []; + if ($repo->fileExists('tokenizer_config.json')) { + $tokenizerConfig = $repo->download('tokenizer_config.json')->json(); } - return $resourcePath; - } - - /** - * Cache a bundle (tokenizer.json + tokenizer_config.json) into a single meta file. - */ - protected function cacheBundle( - string $tokenizerUrl, - ?string $tokenizerPath, - ?string $tokenizerEtag, - string $tokenizerConfigUrl, - ?string $tokenizerConfigPath, - ?string $tokenizerConfigEtag - ): void { - if (null === $this->resolvedCacheDir) { - return; - } - - $bundleKey = hash('sha256', $tokenizerUrl.'|'.$tokenizerConfigUrl); - $metaPath = $this->resolvedCacheDir.\DIRECTORY_SEPARATOR.$bundleKey.'.bundle.meta'; - - $meta = [ - 'tokenizer_url' => $tokenizerUrl, - 'tokenizer_path' => $tokenizerPath, - 'tokenizer_etag' => $tokenizerEtag, - 'tokenizer_config_url' => $tokenizerConfigUrl, - 'tokenizer_config_path' => $tokenizerConfigPath, - 'tokenizer_config_etag' => $tokenizerConfigEtag, - 'creation_time' => microtime(true), - ]; - - file_put_contents($metaPath, json_encode($meta, \JSON_PRETTY_PRINT)); - } - - /** - * Merge tokenizer.json with tokenizer_config.json (config wins). - * - * @param null|array $tokenizer - * @param null|array $tokenizerConfig - * - * @return array - */ - protected function mergeConfigs(?array $tokenizer, ?array $tokenizerConfig): array - { - $tokenizer ??= []; - $tokenizerConfig ??= []; - return array_merge($tokenizer, $tokenizerConfig); } } From 552a24d62d92838ef7d6f8616a625aa83eaeee72 Mon Sep 17 00:00:00 2001 From: Kyrian Obikwelu Date: Wed, 4 Feb 2026 12:11:07 +0100 Subject: [PATCH 2/2] chore: code style fixes --- examples/tokenization_overview.php | 12 +++++------- src/Loaders/HubLoader.php | 3 ++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/tokenization_overview.php b/examples/tokenization_overview.php index 51fa527..daded47 100644 --- a/examples/tokenization_overview.php +++ b/examples/tokenization_overview.php @@ -16,17 +16,16 @@ * Run with: * php examples/tokenization_overview.php */ - $models = [ 'BERT (uncased)' => 'google-bert/bert-base-uncased', - 'GPT-2' => 'openai-community/gpt2', - 'Qwen3 Embedding' => 'Qwen/Qwen3-Embedding-0.6B', + 'GPT-2' => 'openai-community/gpt2', + 'Qwen3 Embedding' => 'Qwen/Qwen3-Embedding-0.6B', ]; $samples = [ 'Short sentence' => 'Hello, how are you doing today?', - 'Code snippet' => 'function sum(int $a, int $b): int { return $a + $b; }', - 'Mixed content' => 'Paris is the capital of France. 42 🧠', + 'Code snippet' => 'function sum(int $a, int $b): int { return $a + $b; }', + 'Mixed content' => 'Paris is the capital of France. 42 🧠', ]; echo "=== Tokenizers PHP - Tokenization Overview ===\n\n"; @@ -43,7 +42,7 @@ $ids = $encoding->ids; $tokens = $encoding->tokens; - $count = \count($ids); + $count = count($ids); $idsPreview = implode(', ', array_slice($ids, 0, 10)); $tokensPreview = implode(' ', array_slice($tokens, 0, 10)); @@ -56,4 +55,3 @@ echo str_repeat('-', 60)."\n\n"; } - diff --git a/src/Loaders/HubLoader.php b/src/Loaders/HubLoader.php index 4529fe4..bfd20fc 100644 --- a/src/Loaders/HubLoader.php +++ b/src/Loaders/HubLoader.php @@ -42,7 +42,8 @@ public function load(string ...$source): array $repo = $hf->hub() ->repo($modelId) - ->revision($this->revision); + ->revision($this->revision) + ; $repo->snapshot( allowPatterns: self::TOKENIZER_FILES,