From 38aebcee3e0280ae7a572e2d3af0eb15e1036920 Mon Sep 17 00:00:00 2001
From: Kyrian Obikwelu <koshnawaza@gmail.com>
Date: Wed, 4 Feb 2026 11:10:39 +0100
Subject: [PATCH 1/2] feat: use Hugging Face PHP for Hub loading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace custom HubLoader HTTP and cache logic with huggingface-php RepoManager
- Use snapshot() plus a manifest to cache tokenizer.json and tokenizer_config.json efficiently
- Remove TOKENIZERS_CACHE handling and custom cache-dir resolution in favor of standard HF cache
- Simplify README highlights and cache documentation to match new behavior
- Trim old, noisy examples and add a single tokenization_overview.php sample
- Update composer requirements to depend on codewithkyrian/huggingface

BREAKING CHANGE: TOKENIZERS_CACHE is no longer read; use cacheDir or
HF_HUB_CACHE/HF_HOME instead. Hub loading behavior and cache layout now
follow huggingface-php’s unified cache.
---
 README.md                                     |  21 +-
 composer.json                                 |   6 +-
 examples/context_window_fit_analysis.php      | 229 ---------
 examples/document_chunking_pipeline.php       | 247 ---------
 examples/semantic_search_embeddings.php       |  93 ----
 .../text_classification_preprocessing.php     | 236 ---------
 examples/tokenization_overview.php            |  59 +++
 src/Loaders/HubLoader.php                     | 484 +-----------------
 8 files changed, 96 insertions(+), 1279 deletions(-)
 delete mode 100644 examples/context_window_fit_analysis.php
 delete mode 100644 examples/document_chunking_pipeline.php
 delete mode 100644 examples/semantic_search_embeddings.php
 delete mode 100644 examples/text_classification_preprocessing.php
 create mode 100644 examples/tokenization_overview.php

diff --git a/README.md b/README.md
index 7675fbd..84f40c4 100644
--- a/README.md
+++ b/README.md
@@ -15,8 +15,8 @@
 ## Highlights
 
 - **Pure PHP** — No FFI, no external binaries, no compiled extensions. Works everywhere PHP runs.
-- **Zero Hard Dependencies** — Core tokenization has no required dependencies. Optional HTTP client needed only for Hub downloads.
-- **Hub Compatible** — Load tokenizers directly from Hugging Face Hub or from local files.
+- **Hub Integration** — Load tokenizers from Hugging Face Hub with smart caching and manifest-based file checks.
+- **Flexible Loading** — Load from local files, config arrays, or build custom tokenizers with the builder API.
 - **Fully Tested** — Validated against BERT, GPT-2, Llama, Gemma, Qwen, RoBERTa, ALBERT, and more.
 - **Modern PHP** — Built for PHP 8.2+ with strict types, readonly classes, and clean interfaces.
 
@@ -28,16 +28,14 @@ Install via Composer:
 composer require codewithkyrian/tokenizers
 ```
 
-### HTTP Client (Optional)
+### HTTP Client (for Hub loading)
 
-If you plan to load tokenizers from the Hugging Face Hub, you'll need an HTTP client implementing PSR-18. We recommend Guzzle:
+Loading tokenizers from the Hugging Face Hub requires an HTTP client. We recommend Guzzle:
 
 ```bash
 composer require guzzlehttp/guzzle
 ```
 
-> **Note:** The library uses [PHP-HTTP Discovery](https://github.com/php-http/discovery) to automatically find and use any PSR-18 compatible HTTP client installed in your project. If you're only loading tokenizers from local files, no HTTP client is needed.
-
 ## Quick Start
 
 ```php
@@ -96,10 +94,13 @@ $tokenizer = Tokenizer::fromHub(
 
 When `cacheDir` is not specified, the library automatically resolves the cache location:
 
-1. **Environment Variable** — `TOKENIZERS_CACHE` if set
-2. **macOS** — `~/Library/Caches/huggingface/tokenizers`
-3. **Linux** — `$XDG_CACHE_HOME/huggingface/tokenizers` or `~/.cache/huggingface/tokenizers`
-4. **Windows** — `%LOCALAPPDATA%\huggingface\tokenizers`
+1. **HF_HUB_CACHE** — if set, used directly
+2. **HF_HOME** — if set, `$HF_HOME/hub`
+3. **macOS** — `~/Library/Caches/huggingface/hub`
+4. **Linux** — `$XDG_CACHE_HOME/huggingface/hub` or `~/.cache/huggingface/hub`
+5. **Windows** — `%LOCALAPPDATA%\huggingface\hub`
+
+Pass `cacheDir` to use a custom directory.
 
 ### From Local Files
 
diff --git a/composer.json b/composer.json
index 457685d..d2ac0ca 100644
--- a/composer.json
+++ b/composer.json
@@ -32,9 +32,7 @@
     },
     "require": {
         "php": "^8.2",
-        "psr/http-client": "^1.0",
-        "psr/http-factory": "^1.0",
-        "php-http/discovery": "^1.19"
+        "codewithkyrian/huggingface": "^1.0"
     },
     "require-dev": {
         "friendsofphp/php-cs-fixer": "^3.91",
@@ -56,4 +54,4 @@
         "cs:check": "vendor/bin/php-cs-fixer fix --dry-run --diff",
         "analyse": "vendor/bin/phpstan analyse -c phpstan.dist.neon"
     }
-}
\ No newline at end of file
+}
diff --git a/examples/context_window_fit_analysis.php b/examples/context_window_fit_analysis.php
deleted file mode 100644
index e5f6101..0000000
--- a/examples/context_window_fit_analysis.php
+++ /dev/null
@@ -1,229 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-/**
- * Context Window Fit Analysis.
- *
- * This example demonstrates how to analyze whether a conversation fits within
- * different LLM context windows. Each model family has its own:
- * - Tokenizer (vocabulary and encoding rules)
- * - Chat template format (how messages are structured)
- * - Context window limits (available via $tokenizer->modelMaxLength)
- *
- * NOTE: Chat templates are typically Jinja templates stored in the tokenizer config.
- * You can install `codewithkyrian/jinja-php` to parse and render them dynamically:
- *
- *     composer require codewithkyrian/jinja-php
- *
- * Here we manually implement each model's chat template format.
- */
-
-require __DIR__.'/../vendor/autoload.php';
-
-use Codewithkyrian\Tokenizers\Tokenizer;
-
-/**
- * Chat template implementations for different model families.
- * Each model uses a different format for structuring conversations.
- */
-
-/**
- * ChatML format (Qwen, Yi, GPT-4o, etc.)
- * Template: <|im_start|>role\ncontent<|im_end|>.
- */
-function applyChatMLTemplate(array $messages, bool $addGenerationPrompt = true): string
-{
-    $formatted = '';
-
-    foreach ($messages as $message) {
-        $formatted .= "<|im_start|>{$message['role']}\n{$message['content']}<|im_end|>\n";
-    }
-
-    if ($addGenerationPrompt) {
-        $formatted .= "<|im_start|>assistant\n";
-    }
-
-    return $formatted;
-}
-
-/**
- * Llama 3 format
- * Template: <|start_header_id|>role<|end_header_id|>\n\ncontent<|eot_id|>.
- */
-function applyLlama3Template(array $messages, bool $addGenerationPrompt = true): string
-{
-    $formatted = '<|begin_of_text|>';
-
-    foreach ($messages as $message) {
-        $formatted .= "<|start_header_id|>{$message['role']}<|end_header_id|>\n\n";
-        $formatted .= "{$message['content']}<|eot_id|>";
-    }
-
-    if ($addGenerationPrompt) {
-        $formatted .= "<|start_header_id|>assistant<|end_header_id|>\n\n";
-    }
-
-    return $formatted;
-}
-
-/**
- * Claude format (Anthropic)
- * Template: \n\nHuman: content\n\nAssistant: content.
- */
-function applyClaudeTemplate(array $messages, bool $addGenerationPrompt = true): string
-{
-    $formatted = '';
-
-    foreach ($messages as $message) {
-        $role = match ($message['role']) {
-            'system' => 'Human', // Claude handles system via system parameter, but we include it here
-            'user' => 'Human',
-            'assistant' => 'Assistant',
-            default => $message['role'],
-        };
-
-        // System messages are typically prepended to the first human message
-        if ('system' === $message['role']) {
-            $formatted .= "{$message['content']}\n\n";
-
-            continue;
-        }
-
-        $formatted .= "\n\n{$role}: {$message['content']}";
-    }
-
-    if ($addGenerationPrompt) {
-        $formatted .= "\n\nAssistant:";
-    }
-
-    return trim($formatted);
-}
-
-$models = [
-    'Qwen2-1.5B-Instruct' => [
-        'hub_id' => 'Qwen/Qwen2-1.5B-Instruct',
-        'template_fn' => 'applyChatMLTemplate',
-    ],
-    'Llama-3' => [
-        'hub_id' => 'Xenova/llama3-tokenizer',
-        'template_fn' => 'applyLlama3Template',
-        'context_window' => 8192,
-    ],
-    'GPT-4o' => [
-        'hub_id' => 'Xenova/gpt-4o',
-        'template_fn' => 'applyChatMLTemplate',
-    ],
-    'Claude-Sonnet-4' => [
-        'hub_id' => 'Xenova/claude-tokenizer',
-        'template_fn' => 'applyClaudeTemplate',
-    ],
-    'Grok-1' => [
-        'hub_id' => 'Xenova/grok-1-tokenizer',
-        'template_fn' => 'applyChatMLTemplate',
-    ],
-    'DeepSeek-V3.2' => [
-        'hub_id' => 'deepseek-ai/DeepSeek-V3.2',
-        'template_fn' => 'applyChatMLTemplate',
-    ],
-];
-
-$conversation = [
-    [
-        'role' => 'system',
-        'content' => 'You are a helpful customer support assistant for TechCorp. Be concise, professional, and always offer to escalate complex issues to a human agent.',
-    ],
-    [
-        'role' => 'user',
-        'content' => 'Hi, I purchased a laptop last week and the screen keeps flickering. I\'ve tried restarting it multiple times but the issue persists.',
-    ],
-    [
-        'role' => 'assistant',
-        'content' => 'I\'m sorry to hear about the screen flickering issue with your new laptop. This could be caused by a few things - a driver issue, loose display cable, or a hardware defect. Let me help you troubleshoot. First, could you tell me the laptop model and whether you\'ve updated the graphics drivers recently?',
-    ],
-    [
-        'role' => 'user',
-        'content' => 'It\'s the TechCorp ProBook 15. I haven\'t updated any drivers since I got it. The flickering happens randomly, sometimes every few minutes, sometimes it doesn\'t happen for hours.',
-    ],
-    [
-        'role' => 'assistant',
-        'content' => 'Thank you for those details. The intermittent nature suggests it might be a driver or software issue rather than hardware. Let\'s try updating your graphics drivers first. Go to Settings > Windows Update > Check for updates, and also check for optional driver updates. If that doesn\'t resolve it within 24 hours, we can arrange a diagnostic or replacement under your warranty.',
-    ],
-    [
-        'role' => 'user',
-        'content' => 'Okay, I\'ll try that. If it doesn\'t work, how do I arrange the diagnostic? And will I lose my data?',
-    ],
-];
-
-echo "=== Context Window Fit Analysis ===\n\n";
-echo "Analyzing the same conversation across different LLM tokenizers.\n";
-echo 'Messages: '.count($conversation)."\n\n";
-
-$results = [];
-
-foreach ($models as $modelName => $config) {
-    echo "Loading {$modelName}...\n";
-
-    $tokenizer = Tokenizer::fromHub($config['hub_id']);
-    $templateFn = $config['template_fn'];
-    $formattedPrompt = $templateFn($conversation);
-    $encoding = $tokenizer->encode($formattedPrompt);
-    $tokenCount = count($encoding->ids);
-    $contextWindow = $config['context_window'] ?? $tokenizer->modelMaxLength;
-
-    $results[$modelName] = [
-        'tokens' => $tokenCount,
-        'context_window' => $contextWindow,
-        'percent_used' => $contextWindow ? round(($tokenCount / $contextWindow) * 100, 2) : null,
-        'remaining' => $contextWindow ? $contextWindow - $tokenCount : null,
-        'fits' => $contextWindow ? $tokenCount <= $contextWindow : null,
-    ];
-}
-
-echo "\n".str_repeat('=', 85)."\n";
-echo "RESULTS\n";
-echo str_repeat('=', 85)."\n\n";
-
-// Header
-printf(
-    "%-20s │ %10s │ %15s │ %8s │ %15s │ %s\n",
-    'Model',
-    'Tokens',
-    'Context Window',
-    'Used %',
-    'Remaining',
-    'Status'
-);
-echo str_repeat('─', 85)."\n";
-
-foreach ($results as $modelName => $data) {
-    $contextStr = null !== $data['context_window']
-        ? number_format($data['context_window'])
-        : 'N/A';
-
-    $percentStr = null !== $data['percent_used']
-        ? $data['percent_used'].'%'
-        : 'N/A';
-
-    $remainingStr = null !== $data['remaining']
-        ? number_format($data['remaining'])
-        : 'N/A';
-
-    $status = match (true) {
-        null === $data['fits'] => '? Unknown',
-        $data['fits'] => '✓ Fits',
-        default => '✗ Exceeds',
-    };
-
-    printf(
-        "%-20s │ %10s │ %15s │ %8s │ %15s │ %s\n",
-        $modelName,
-        number_format($data['tokens']),
-        $contextStr,
-        $percentStr,
-        $remainingStr,
-        $status
-    );
-}
-
-echo str_repeat('─', 85)."\n\n";
diff --git a/examples/document_chunking_pipeline.php b/examples/document_chunking_pipeline.php
deleted file mode 100644
index 9abde6a..0000000
--- a/examples/document_chunking_pipeline.php
+++ /dev/null
@@ -1,247 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-/**
- * Document Chunking Pipeline.
- *
- * This example demonstrates how to split long documents into token-aware chunks
- * for processing with models that have context length limits. Common use cases:
- * - Splitting long articles for embedding and indexing
- * - Preparing documents for summarization
- * - Breaking down PDFs/documents for RAG pipelines
- * - Processing large text files for analysis
- *
- * The chunking respects sentence boundaries to maintain semantic coherence.
- */
-
-require __DIR__.'/../vendor/autoload.php';
-
-use Codewithkyrian\Tokenizers\Tokenizer;
-
-// A long document that needs to be chunked (simulating content from a PDF or article)
-$longDocument = <<<'DOCUMENT'
-The History and Evolution of Artificial Intelligence
-
-Artificial intelligence has a rich history spanning over seven decades. The field was officially born at the Dartmouth Conference in 1956, where John McCarthy, Marvin Minsky, Nathaniel Rochester, and Claude Shannon proposed that "every aspect of learning or any other feature of intelligence can in principle be so precisely described that a machine can be made to simulate it."
-
-The early years of AI research were characterized by tremendous optimism. Researchers developed programs that could prove mathematical theorems, play chess, and solve algebra problems. The General Problem Solver, developed by Herbert Simon and Allen Newell in 1957, was one of the first programs designed to mimic human problem-solving processes.
-
-However, the field soon encountered significant challenges. The limitations of early computing hardware, combined with the complexity of real-world problems, led to the first "AI Winter" in the 1970s. Funding dried up as the promised breakthroughs failed to materialize.
-
-The 1980s saw a resurgence with the development of expert systems. These programs encoded human expertise in specific domains and found commercial applications in medicine, finance, and manufacturing. Companies invested heavily in AI, leading to a second wave of enthusiasm.
-
-This enthusiasm was short-lived. Expert systems proved brittle and expensive to maintain, leading to another AI Winter in the late 1980s and early 1990s. Many researchers abandoned the field, and AI became almost a taboo term in academic circles.
-
-The modern AI renaissance began in the 2010s, driven by three key factors: the availability of large datasets, improvements in computing power (particularly GPUs), and advances in deep learning algorithms. The ImageNet competition in 2012 marked a turning point when a deep neural network dramatically outperformed traditional computer vision approaches.
-
-Since then, AI has achieved remarkable milestones. In 2016, DeepMind's AlphaGo defeated the world champion in Go, a game long considered too complex for machines. Natural language processing has been transformed by transformer architectures, leading to large language models that can engage in sophisticated conversations, write code, and assist with complex reasoning tasks.
-
-Today, AI is integrated into countless applications: recommendation systems, autonomous vehicles, medical diagnosis, scientific research, and creative tools. The technology continues to advance rapidly, raising both exciting possibilities and important ethical questions about its role in society.
-
-As we look to the future, researchers are exploring new frontiers: artificial general intelligence, neuromorphic computing, and hybrid systems that combine neural networks with symbolic reasoning. The field that began as an academic curiosity has become one of the most transformative technologies of our time.
-
-The journey of AI reminds us that breakthrough technologies often follow a pattern of hype, disappointment, and eventual realization - sometimes taking decades longer than initially predicted. As we stand on the cusp of potentially even greater advances, this history provides valuable lessons about managing expectations while continuing to push the boundaries of what machines can achieve.
-DOCUMENT;
-
-// Configuration for different model context windows
-$chunkConfigs = [
-    'embedding_model' => [
-        'max_tokens' => 256,
-        'overlap_tokens' => 50,
-        'description' => 'For sentence-transformer embeddings',
-    ],
-    'summarization_model' => [
-        'max_tokens' => 512,
-        'overlap_tokens' => 100,
-        'description' => 'For T5/BART summarization',
-    ],
-    'llm_context' => [
-        'max_tokens' => 1024,
-        'overlap_tokens' => 200,
-        'description' => 'For LLM context windows',
-    ],
-];
-
-echo "=== Document Chunking Pipeline Example ===\n\n";
-
-// Load tokenizer
-$tokenizer = Tokenizer::fromHub('bert-base-uncased');
-
-echo "Tokenizer loaded: bert-base-uncased\n\n";
-
-// Analyze the full document first
-$fullEncoding = $tokenizer->encode($longDocument);
-$totalTokens = count($fullEncoding->ids);
-
-echo "--- Document Analysis ---\n\n";
-echo 'Total characters: '.mb_strlen($longDocument)."\n";
-echo "Total tokens: {$totalTokens}\n";
-echo 'Average tokens per character: '.round($totalTokens / mb_strlen($longDocument), 3)."\n\n";
-
-/**
- * Split text into sentences (simple implementation).
- */
-function splitIntoSentences(string $text): array
-{
-    // Split on sentence-ending punctuation followed by space or end
-    $sentences = preg_split('/(?<=[.!?])\s+/', $text, -1, \PREG_SPLIT_NO_EMPTY);
-
-    return array_map('trim', $sentences);
-}
-
-/**
- * Create token-aware chunks from a document.
- *
- * @param int $maxTokens     Maximum tokens per chunk (excluding special tokens)
- * @param int $overlapTokens Number of tokens to overlap between chunks
- *
- * @return array{chunks: array, metadata: array}
- */
-function createChunks(Tokenizer $tokenizer, string $text, int $maxTokens, int $overlapTokens): array
-{
-    $sentences = splitIntoSentences($text);
-    $chunks = [];
-    $metadata = [];
-
-    $currentChunk = [];
-    $currentTokenCount = 0;
-    $chunkStartIndex = 0;
-
-    foreach ($sentences as $sentenceIndex => $sentence) {
-        $sentenceEncoding = $tokenizer->encode($sentence, addSpecialTokens: false);
-        $sentenceTokens = count($sentenceEncoding->ids);
-
-        // If single sentence exceeds max, we need to split it (edge case)
-        if ($sentenceTokens > $maxTokens) {
-            // Flush current chunk first
-            if (!empty($currentChunk)) {
-                $chunkText = implode(' ', $currentChunk);
-                $chunks[] = $chunkText;
-                $metadata[] = [
-                    'chunk_index' => count($chunks) - 1,
-                    'sentence_range' => [$chunkStartIndex, $sentenceIndex - 1],
-                    'token_count' => $currentTokenCount,
-                ];
-            }
-
-            // Add the long sentence as its own chunk (will be truncated by model)
-            $chunks[] = $sentence;
-            $metadata[] = [
-                'chunk_index' => count($chunks) - 1,
-                'sentence_range' => [$sentenceIndex, $sentenceIndex],
-                'token_count' => $sentenceTokens,
-                'warning' => 'Sentence exceeds max tokens, may be truncated',
-            ];
-
-            $currentChunk = [];
-            $currentTokenCount = 0;
-            $chunkStartIndex = $sentenceIndex + 1;
-
-            continue;
-        }
-
-        // Check if adding this sentence would exceed the limit
-        if ($currentTokenCount + $sentenceTokens > $maxTokens && !empty($currentChunk)) {
-            // Save current chunk
-            $chunkText = implode(' ', $currentChunk);
-            $chunks[] = $chunkText;
-            $metadata[] = [
-                'chunk_index' => count($chunks) - 1,
-                'sentence_range' => [$chunkStartIndex, $sentenceIndex - 1],
-                'token_count' => $currentTokenCount,
-            ];
-
-            // Calculate overlap: include last N tokens worth of sentences
-            $overlapSentences = [];
-            $overlapCount = 0;
-            for ($i = count($currentChunk) - 1; $i >= 0 && $overlapCount < $overlapTokens; --$i) {
-                $sentEnc = $tokenizer->encode($currentChunk[$i], addSpecialTokens: false);
-                $overlapCount += count($sentEnc->ids);
-                array_unshift($overlapSentences, $currentChunk[$i]);
-            }
-
-            $currentChunk = $overlapSentences;
-            $currentTokenCount = $overlapCount;
-            $chunkStartIndex = $sentenceIndex - count($overlapSentences);
-        }
-
-        $currentChunk[] = $sentence;
-        $currentTokenCount += $sentenceTokens;
-    }
-
-    // Don't forget the last chunk
-    if (!empty($currentChunk)) {
-        $chunkText = implode(' ', $currentChunk);
-        $chunks[] = $chunkText;
-        $metadata[] = [
-            'chunk_index' => count($chunks) - 1,
-            'sentence_range' => [$chunkStartIndex, count($sentences) - 1],
-            'token_count' => $currentTokenCount,
-        ];
-    }
-
-    return ['chunks' => $chunks, 'metadata' => $metadata];
-}
-
-// Process document with different configurations
-foreach ($chunkConfigs as $configName => $config) {
-    echo "--- Chunking for: {$config['description']} ---\n";
-    echo "Max tokens: {$config['max_tokens']}, Overlap: {$config['overlap_tokens']}\n\n";
-
-    $result = createChunks(
-        $tokenizer,
-        $longDocument,
-        $config['max_tokens'],
-        $config['overlap_tokens']
-    );
-
-    echo 'Created '.count($result['chunks'])." chunks:\n\n";
-
-    foreach ($result['chunks'] as $index => $chunk) {
-        $meta = $result['metadata'][$index];
-        $preview = mb_substr($chunk, 0, 80);
-
-        echo 'Chunk '.($index + 1).":\n";
-        echo "  Tokens: {$meta['token_count']}\n";
-        echo "  Sentences: {$meta['sentence_range'][0]}-{$meta['sentence_range'][1]}\n";
-        echo "  Preview: \"{$preview}...\"\n";
-
-        if (isset($meta['warning'])) {
-            echo "  ⚠️ {$meta['warning']}\n";
-        }
-        echo "\n";
-    }
-
-    echo str_repeat('-', 60)."\n\n";
-}
-
-// Demonstrate verification: all text should be recoverable from chunks (minus overlaps)
-echo "--- Chunk Verification ---\n\n";
-
-$config = $chunkConfigs['embedding_model'];
-$result = createChunks($tokenizer, $longDocument, $config['max_tokens'], $config['overlap_tokens']);
-
-// Verify each chunk encodes correctly
-$allValid = true;
-foreach ($result['chunks'] as $index => $chunk) {
-    $encoding = $tokenizer->encode($chunk);
-    $decoded = $tokenizer->decode($encoding->ids, skipSpecialTokens: true);
-
-    // Simple check: decoded text should contain key words from original
-    $originalWords = array_slice(explode(' ', $chunk), 0, 5);
-    $decodedWords = explode(' ', $decoded);
-
-    $matchCount = count(array_filter($originalWords, static fn ($w) => in_array(strtolower($w), array_map('strtolower', $decodedWords))));
-
-    if ($matchCount < 3) {
-        echo '⚠️ Chunk '.($index + 1)." may have encoding issues\n";
-        $allValid = false;
-    }
-}
-
-if ($allValid) {
-    echo "✓ All chunks verified - encode/decode roundtrip successful\n";
-}
-
-echo "\nPipeline complete! Chunks are ready for embedding or processing.\n";
diff --git a/examples/semantic_search_embeddings.php b/examples/semantic_search_embeddings.php
deleted file mode 100644
index 220f9a9..0000000
--- a/examples/semantic_search_embeddings.php
+++ /dev/null
@@ -1,93 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-/**
- * Semantic Search & Embedding Preparation.
- *
- * This example shows how to prepare text for semantic search or vector embedding
- * pipelines. Common use cases include:
- * - Building searchable document indexes
- * - Generating embeddings for similarity matching
- * - Preparing queries and documents for retrieval-augmented generation (RAG)
- *
- * We use the all-MiniLM model which is popular for sentence embeddings.
- */
-
-require __DIR__.'/../vendor/autoload.php';
-
-use Codewithkyrian\Tokenizers\Tokenizer;
-
-$documents = [
-    [
-        'id' => 'doc_001',
-        'title' => 'Introduction to Machine Learning',
-        'content' => 'Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It focuses on developing algorithms that can access data and use it to learn for themselves.',
-    ],
-    [
-        'id' => 'doc_002',
-        'title' => 'Neural Networks Explained',
-        'content' => 'Neural networks are computing systems inspired by biological neural networks in the human brain. They consist of interconnected nodes organized in layers that process information using connectionist approaches to computation.',
-    ],
-    [
-        'id' => 'doc_003',
-        'title' => 'Natural Language Processing',
-        'content' => 'NLP combines computational linguistics with statistical, machine learning, and deep learning models. It enables computers to process and analyze large amounts of natural language data, from text classification to machine translation.',
-    ],
-    [
-        'id' => 'doc_004',
-        'title' => 'Computer Vision Applications',
-        'content' => 'Computer vision trains machines to interpret and understand visual information from the world. Applications range from facial recognition and autonomous vehicles to medical image analysis and industrial quality control.',
-    ],
-];
-
-$searchQueries = [
-    'How do computers learn from data?',
-    'What are the layers in AI systems?',
-    'Processing human language with AI',
-];
-
-echo "=== Semantic Search Tokenization Example ===\n\n";
-
-$tokenizer = Tokenizer::fromHub('sentence-transformers/all-MiniLM-L6-v2');
-
-echo "Tokenizer loaded: sentence-transformers/all-MiniLM-L6-v2\n\n";
-
-echo "--- Processing Documents ---\n\n";
-
-$maxTokenLength = 256;
-
-foreach ($documents as $doc) {
-    $text = $doc['title'].'. '.$doc['content'];
-    $encoding = $tokenizer->encode($text);
-
-    $tokenCount = count($encoding->ids);
-    $truncated = $tokenCount > $maxTokenLength;
-
-    echo "Document: {$doc['id']}\n";
-    echo "  Title: {$doc['title']}\n";
-    echo "  Token count: {$tokenCount}".($truncated ? ' (would need truncation)' : '')."\n";
-    echo '  First 10 tokens: '.implode(', ', array_slice($encoding->tokens, 0, 10))."...\n";
-    echo '  First 10 IDs: ['.implode(', ', array_slice($encoding->ids, 0, 10))."...]\n\n";
-}
-
-echo "--- Processing Search Queries ---\n\n";
-
-foreach ($searchQueries as $index => $query) {
-    $encoding = $tokenizer->encode($query);
-
-    echo 'Query '.($index + 1).": \"{$query}\"\n";
-    echo '  Tokens: '.implode(', ', $encoding->tokens)."\n";
-    echo '  IDs: ['.implode(', ', $encoding->ids)."]\n";
-    echo '  Token count: '.count($encoding->ids)."\n\n";
-}
-
-echo "--- Round-trip Verification ---\n\n";
-
-$testText = 'Machine learning enables pattern recognition in data.';
-$encoding = $tokenizer->encode($testText);
-$decoded = $tokenizer->decode($encoding->ids);
-
-echo "Original: {$testText}\n";
-echo 'Encoded: ['.implode(', ', $encoding->ids)."]\n";
-echo "Decoded: {$decoded}\n";
diff --git a/examples/text_classification_preprocessing.php b/examples/text_classification_preprocessing.php
deleted file mode 100644
index a75c423..0000000
--- a/examples/text_classification_preprocessing.php
+++ /dev/null
@@ -1,236 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-/**
- * Text Classification Preprocessing.
- *
- * This example shows how to prepare text data for classification tasks using
- * BERT-style tokenization. Common use cases include:
- * - Sentiment analysis
- * - Spam detection
- * - Topic classification
- * - Content moderation
- *
- * We demonstrate preprocessing with type IDs for sentence pair tasks like
- * natural language inference (NLI) and question answering.
- */
-
-require __DIR__.'/../vendor/autoload.php';
-
-use Codewithkyrian\Tokenizers\Tokenizer;
-
-// Sample data for sentiment analysis (e.g., product reviews)
-$sentimentSamples = [
-    [
-        'text' => 'Absolutely love this product! The quality exceeded my expectations and shipping was incredibly fast. Will definitely buy again.',
-        'label' => 'positive',
-    ],
-    [
-        'text' => 'Decent product for the price. Does what it\'s supposed to do, nothing more nothing less. Packaging could be better.',
-        'label' => 'neutral',
-    ],
-    [
-        'text' => 'Complete waste of money. Arrived broken, customer service was unhelpful, and the return process took forever. Avoid!',
-        'label' => 'negative',
-    ],
-    [
-        'text' => 'The device worked fine for about two weeks, then suddenly stopped charging. Replacement had the same issue. Very frustrating experience.',
-        'label' => 'negative',
-    ],
-];
-
-// Sample data for Natural Language Inference (premise + hypothesis pairs)
-$nliSamples = [
-    [
-        'premise' => 'A man is playing a guitar on stage in front of a large crowd.',
-        'hypothesis' => 'A musician is performing at a concert.',
-        'label' => 'entailment',
-    ],
-    [
-        'premise' => 'A man is playing a guitar on stage in front of a large crowd.',
-        'hypothesis' => 'A man is sleeping in his bedroom.',
-        'label' => 'contradiction',
-    ],
-    [
-        'premise' => 'A man is playing a guitar on stage in front of a large crowd.',
-        'hypothesis' => 'The man is a professional musician.',
-        'label' => 'neutral',
-    ],
-];
-
-// Sample data for question answering context matching
-$qaSamples = [
-    [
-        'question' => 'What is the capital of France?',
-        'context' => 'France is a country in Western Europe. Its capital city is Paris, which is known for the Eiffel Tower and the Louvre Museum.',
-    ],
-    [
-        'question' => 'When was the company founded?',
-        'context' => 'TechCorp was founded in 2010 by Jane Smith and John Doe. The company started in a small garage in Silicon Valley and has since grown to over 5000 employees worldwide.',
-    ],
-];
-
-echo "=== Text Classification Preprocessing Example ===\n\n";
-
-// Load BERT tokenizer - the standard for many classification tasks
-$tokenizer = Tokenizer::fromHub('bert-base-uncased');
-
-echo "Tokenizer loaded: bert-base-uncased\n";
-echo "Max sequence length for BERT: 512 tokens\n\n";
-
-// ============================================
-// SINGLE SEQUENCE: Sentiment Analysis
-// ============================================
-
-echo "--- Sentiment Analysis (Single Sequence) ---\n\n";
-
-$maxLength = 128; // Typical for classification tasks
-
-foreach ($sentimentSamples as $index => $sample) {
-    $encoding = $tokenizer->encode($sample['text']);
-
-    $tokenCount = count($encoding->ids);
-    $needsPadding = $tokenCount < $maxLength;
-    $needsTruncation = $tokenCount > $maxLength;
-
-    echo 'Sample '.($index + 1)." [{$sample['label']}]:\n";
-    echo '  Text: "'.mb_substr($sample['text'], 0, 60)."...\"\n";
-    echo "  Token count: {$tokenCount}\n";
-
-    // Show BERT's special token structure
-    echo "  Structure: [CLS] ... text tokens ... [SEP]\n";
-    echo '  First 5: '.implode(' ', array_slice($encoding->tokens, 0, 5))."\n";
-    echo '  Last 3: '.implode(' ', array_slice($encoding->tokens, -3))."\n";
-
-    if ($needsPadding) {
-        $paddingNeeded = $maxLength - $tokenCount;
-        echo "  Padding needed: {$paddingNeeded} [PAD] tokens\n";
-    }
-    if ($needsTruncation) {
-        $truncateCount = $tokenCount - $maxLength;
-        echo "  Truncation needed: remove {$truncateCount} tokens\n";
-    }
-
-    echo "\n";
-}
-
-// ============================================
-// SENTENCE PAIRS: Natural Language Inference
-// ============================================
-
-echo "--- Natural Language Inference (Sentence Pairs) ---\n\n";
-
-foreach ($nliSamples as $index => $sample) {
-    // BERT uses textPair for sentence pair tasks
-    $encoding = $tokenizer->encode(
-        text: $sample['premise'],
-        textPair: $sample['hypothesis'],
-        addSpecialTokens: true
-    );
-
-    echo 'Sample '.($index + 1)." [{$sample['label']}]:\n";
-    echo "  Premise: \"{$sample['premise']}\"\n";
-    echo "  Hypothesis: \"{$sample['hypothesis']}\"\n";
-    echo '  Token count: '.count($encoding->ids)."\n";
-
-    // Show the structure with type IDs
-    echo "  Structure: [CLS] premise [SEP] hypothesis [SEP]\n";
-
-    // Type IDs distinguish between premise (0) and hypothesis (1)
-    $segment0Count = count(array_filter($encoding->typeIds, static fn ($t) => 0 === $t));
-    $segment1Count = count(array_filter($encoding->typeIds, static fn ($t) => 1 === $t));
-
-    echo "  Segment A (premise) tokens: {$segment0Count}\n";
-    echo "  Segment B (hypothesis) tokens: {$segment1Count}\n";
-    echo '  Type IDs sample: ['.implode(', ', array_slice($encoding->typeIds, 0, 10))."...]\n\n";
-}
-
-// ============================================
-// QUESTION ANSWERING PAIRS
-// ============================================
-
-echo "--- Question Answering (Question + Context) ---\n\n";
-
-foreach ($qaSamples as $index => $sample) {
-    $encoding = $tokenizer->encode(
-        text: $sample['question'],
-        textPair: $sample['context'],
-        addSpecialTokens: true
-    );
-
-    echo 'Sample '.($index + 1).":\n";
-    echo "  Question: \"{$sample['question']}\"\n";
-    echo '  Context: "'.mb_substr($sample['context'], 0, 80)."...\"\n";
-    echo '  Total tokens: '.count($encoding->ids)."\n";
-
-    // Find where the context starts (after second segment begins)
-    $contextStart = array_search(1, $encoding->typeIds);
-    echo "  Question tokens: {$contextStart}\n";
-    echo '  Context tokens: '.(count($encoding->ids) - $contextStart)."\n\n";
-}
-
-// ============================================
-// BATCH PROCESSING HELPER
-// ============================================
-
-echo "--- Batch Preprocessing Helper ---\n\n";
-
-/**
- * Preprocess a batch of texts for classification.
- * In production, you'd send these to your model.
- *
- * @return array{input_ids: array, attention_mask: array, token_type_ids: array}
- */
-function preprocessBatch(Tokenizer $tokenizer, array $texts, int $maxLength = 128): array
-{
-    $batchInputIds = [];
-    $batchAttentionMask = [];
-    $batchTokenTypeIds = [];
-
-    foreach ($texts as $text) {
-        $encoding = $tokenizer->encode($text);
-
-        $ids = $encoding->ids;
-        $typeIds = $encoding->typeIds;
-
-        // Truncate if needed
-        if (count($ids) > $maxLength) {
-            $ids = array_slice($ids, 0, $maxLength - 1);
-            $ids[] = 102; // [SEP] token ID for BERT
-            $typeIds = array_slice($typeIds, 0, $maxLength);
-        }
-
-        // Pad if needed
-        $paddingLength = $maxLength - count($ids);
-        $attentionMask = array_merge(
-            array_fill(0, count($ids), 1),
-            array_fill(0, $paddingLength, 0)
-        );
-
-        $ids = array_merge($ids, array_fill(0, $paddingLength, 0)); // [PAD] = 0
-        $typeIds = array_merge($typeIds, array_fill(0, $paddingLength, 0));
-
-        $batchInputIds[] = $ids;
-        $batchAttentionMask[] = $attentionMask;
-        $batchTokenTypeIds[] = $typeIds;
-    }
-
-    return [
-        'input_ids' => $batchInputIds,
-        'attention_mask' => $batchAttentionMask,
-        'token_type_ids' => $batchTokenTypeIds,
-    ];
-}
-
-// Process the sentiment samples as a batch
-$texts = array_column($sentimentSamples, 'text');
-$batch = preprocessBatch($tokenizer, $texts, 64);
-
-echo 'Batch processed: '.count($texts)." samples\n";
-echo "Each padded/truncated to: 64 tokens\n";
-echo "Output shapes:\n";
-echo '  input_ids: ['.count($batch['input_ids']).', '.count($batch['input_ids'][0])."]\n";
-echo '  attention_mask: ['.count($batch['attention_mask']).', '.count($batch['attention_mask'][0])."]\n";
-echo '  token_type_ids: ['.count($batch['token_type_ids']).', '.count($batch['token_type_ids'][0])."]\n";
-echo "\nReady for model input!\n";
diff --git a/examples/tokenization_overview.php b/examples/tokenization_overview.php
new file mode 100644
index 0000000..51fa527
--- /dev/null
+++ b/examples/tokenization_overview.php
@@ -0,0 +1,59 @@
+<?php
+
+declare(strict_types=1);
+
+require __DIR__.'/../vendor/autoload.php';
+
+use Codewithkyrian\Tokenizers\Tokenizer;
+
+/**
+ * Minimal tokenization overview.
+ *
+ * - Loads the tokenizers of a few popular models from the Hub
+ * - Tokenizes example texts
+ * - Prints token counts and a preview of the first 10 tokens / ids
+ *
+ * Run with:
+ *   php examples/tokenization_overview.php
+ */
+
+$models = [
+    'BERT (uncased)' => 'google-bert/bert-base-uncased',
+    'GPT-2'          => 'openai-community/gpt2',
+    'Qwen3 Embedding'      => 'Qwen/Qwen3-Embedding-0.6B',
+];
+
+$samples = [
+    'Short sentence' => 'Hello, how are you doing today?',
+    'Code snippet'   => 'function sum(int $a, int $b): int { return $a + $b; }',
+    'Mixed content'  => 'Paris is the capital of France. 42 🧠',
+];
+
+echo "=== Tokenizers PHP - Tokenization Overview ===\n\n";
+
+foreach ($models as $label => $modelId) {
+    echo "Model: {$label}\n";
+    echo "Hub ID: {$modelId}\n";
+
+    $tokenizer = Tokenizer::fromHub($modelId);
+
+    foreach ($samples as $sampleLabel => $text) {
+        $encoding = $tokenizer->encode($text);
+
+        $ids = $encoding->ids;
+        $tokens = $encoding->tokens;
+
+        $count = \count($ids);
+        $idsPreview = implode(', ', array_slice($ids, 0, 10));
+        $tokensPreview = implode(' ', array_slice($tokens, 0, 10));
+
+        echo "- {$sampleLabel}:\n";
+        echo "  Text: {$text}\n";
+        echo "  Token count: {$count}\n";
+        echo "  IDs (first 10): {$idsPreview}".($count > 10 ? ' ...' : '')."\n";
+        echo "  Tokens (first 10): {$tokensPreview}".($count > 10 ? ' ...' : '')."\n\n";
+    }
+
+    echo str_repeat('-', 60)."\n\n";
+}
+
diff --git a/src/Loaders/HubLoader.php b/src/Loaders/HubLoader.php
index e6f189c..4529fe4 100644
--- a/src/Loaders/HubLoader.php
+++ b/src/Loaders/HubLoader.php
@@ -4,494 +4,58 @@
 
 namespace Codewithkyrian\Tokenizers\Loaders;
 
+use Codewithkyrian\HuggingFace\HuggingFace;
 use Codewithkyrian\Tokenizers\Contracts\ConfigLoaderInterface;
-use Http\Discovery\Psr17FactoryDiscovery;
-use Http\Discovery\Psr18ClientDiscovery;
-use Psr\Http\Client\ClientInterface;
-use Psr\Http\Message\RequestFactoryInterface;
-use Psr\Http\Message\RequestInterface;
-use Psr\Http\Message\ResponseInterface;
-use Psr\Http\Message\UriFactoryInterface;
 
+/**
+ * Loads tokenizer.json and, if present, tokenizer_config.json from the Hugging Face Hub and merges them (config wins).
+ */
 class HubLoader implements ConfigLoaderInterface
 {
-    protected const HF_ENDPOINT = 'https://huggingface.co';
-    protected const TOKENIZERS_VERSION = '0.1.0';
-
-    protected ClientInterface $client;
-    protected RequestFactoryInterface $requestFactory;
-    protected UriFactoryInterface $uriFactory;
-
-    protected ?string $resolvedCacheDir = null;
+    private const TOKENIZER_FILES = ['tokenizer.json', 'tokenizer_config.json'];
 
     public function __construct(
         protected ?string $cacheDir = null,
-        protected ?string $revision = 'main',
+        protected string $revision = 'main',
         protected ?string $token = null
-    ) {
-        $this->client = Psr18ClientDiscovery::find();
-        $this->requestFactory = Psr17FactoryDiscovery::findRequestFactory();
-        $this->uriFactory = Psr17FactoryDiscovery::findUriFactory();
-        $this->resolvedCacheDir = $this->resolveCacheDir();
-    }
+    ) {}
 
     public function load(string ...$source): array
     {
         if (0 === \count($source)) {
-            throw new \Exception('A model ID must be provided.');
+            throw new \InvalidArgumentException('A model ID must be provided.');
         }
 
         $modelId = $source[0];
 
-        $encodedSource = implode('/', array_map('rawurlencode', explode('/', $modelId)));
-        $encodedRevision = rawurlencode($this->revision);
-
-        $tokenizerUrl = \sprintf(
-            '%s/%s/resolve/%s/tokenizer.json',
-            self::HF_ENDPOINT,
-            $encodedSource,
-            $encodedRevision
-        );
-
-        $tokenizerConfigUrl = \sprintf(
-            '%s/%s/resolve/%s/tokenizer_config.json',
-            self::HF_ENDPOINT,
-            $encodedSource,
-            $encodedRevision
-        );
-
-        $bundle = $this->loadFromBundleCache($tokenizerUrl, $tokenizerConfigUrl);
-        if (null !== $bundle) {
-            return $bundle;
-        }
-
-        [$tokenizerJson, $tokenizerPath, $tokenizerEtag] = $this->downloadJson($tokenizerUrl, 'tokenizer.json', $modelId);
-        [$tokenizerConfig, $tokenizerConfigPath, $tokenizerConfigEtag] = $this->downloadJson($tokenizerConfigUrl, 'tokenizer_config.json', $modelId, optional: true);
+        $factory = HuggingFace::factory();
 
-        if (null !== $this->resolvedCacheDir) {
-            $this->cacheBundle(
-                $tokenizerUrl,
-                $tokenizerPath,
-                $tokenizerEtag,
-                $tokenizerConfigUrl,
-                $tokenizerConfigPath,
-                $tokenizerConfigEtag
-            );
+        if (null !== $this->token) {
+            $factory = $factory->withToken($this->token);
         }
 
-        return $this->mergeConfigs($tokenizerJson, $tokenizerConfig);
-    }
-
-    /**
-     * Sends a request and follows HTTP redirects until a non-redirect response is received.
-     *
-     * @param RequestInterface $request      the initial request
-     * @param int              $maxRedirects maximum number of redirects to follow (default: 5)
-     *
-     * @return ResponseInterface the final response after following redirects
-     *
-     * @throws \Exception if too many redirects occur or if a redirect fails
-     */
-    protected function sendRequest(RequestInterface $request, int $maxRedirects = 5): ResponseInterface
-    {
-        $redirectCount = 0;
-        $currentRequest = $request;
-
-        while ($redirectCount < $maxRedirects) {
-            $response = $this->client->sendRequest($currentRequest);
-            $statusCode = $response->getStatusCode();
-
-            // Check if this is a redirect (3xx status code)
-            if ($statusCode >= 300 && $statusCode < 400) {
-                $location = $response->getHeaderLine('Location');
-                if (empty($location)) {
-                    throw new \Exception('Received redirect response without Location header');
-                }
-
-                if (preg_match('/^https?:\/\//', $location)) {
-                    $parsed = parse_url($location);
-                    if (false !== $parsed) {
-                        $uri = $this->uriFactory->createUri()
-                            ->withScheme($parsed['scheme'] ?? 'https')
-                            ->withHost($parsed['host'] ?? '')
-                            ->withPort($parsed['port'] ?? null)
-                            ->withPath($parsed['path'] ?? '/')
-                            ->withQuery($parsed['query'] ?? '')
-                            ->withFragment($parsed['fragment'] ?? '')
-                        ;
-
-                        $currentRequest = $this->requestFactory->createRequest('GET', $uri);
-                    } else {
-                        $currentRequest = $this->requestFactory->createRequest('GET', $location);
-                    }
-                } else {
-                    $parsed = parse_url($location);
-
-                    $newUri = $currentRequest->getUri()
-                        ->withQuery($parsed['query'] ?? '')
-                        ->withFragment($parsed['fragment'] ?? '')
-                    ;
-
-                    $locationPath = $parsed['path'] ?? $location;
-
-                    if (str_starts_with($location, '/')) {
-                        $newUri = $newUri->withPath($locationPath);
-                    } else {
-                        $basePath = $newUri->getPath();
-                        $basePath = '.' === \dirname($basePath) ? '/' : \dirname($basePath);
-                        $newUri = $newUri->withPath(rtrim($basePath, '/').'/'.$locationPath);
-                    }
-
-                    $currentRequest = $this->requestFactory->createRequest('GET', $newUri);
-                }
-
-                $currentRequest = $currentRequest
-                    ->withHeader('User-Agent', 'tokenizers/'.self::TOKENIZERS_VERSION.'; PHP')
-                ;
-
-                if ($this->token) {
-                    $currentRequest = $currentRequest->withHeader('Authorization', "Bearer {$this->token}");
-                }
-
-                ++$redirectCount;
-
-                continue;
-            }
-
-            return $response;
-        }
-
-        throw new \Exception("Too many redirects (max: {$maxRedirects})");
-    }
-
-    /**
-     * Resolves the cache directory based on OS.
-     *
-     * @return null|string the cache directory path, or null if it cannot be determined
-     */
-    protected function resolveCacheDir(): ?string
-    {
         if (null !== $this->cacheDir) {
-            return $this->ensureCacheDir($this->cacheDir);
+            $factory = $factory->withCacheDir($this->cacheDir);
         }
 
-        $envCache = getenv('TOKENIZERS_CACHE');
-        if (false !== $envCache) {
-            return $this->ensureCacheDir($envCache);
-        }
-
-        $baseDir = $this->getOSCacheBaseDir();
-        if (null === $baseDir) {
-            return null;
-        }
-
-        $cacheDir = $baseDir.\DIRECTORY_SEPARATOR.'huggingface'.\DIRECTORY_SEPARATOR.'tokenizers';
-
-        return $this->ensureCacheDir($cacheDir);
-    }
-
-    /**
-     * Gets the OS-specific base cache directory.
-     *
-     * @return null|string the base cache directory, or null if it cannot be determined
-     */
-    protected function getOSCacheBaseDir(): ?string
-    {
-        if (\PHP_OS_FAMILY === 'Windows') {
-            $localAppData = getenv('LOCALAPPDATA');
+        $hf = $factory->make();
 
-            return false !== $localAppData ? $localAppData : null;
-        }
-
-        if (\PHP_OS_FAMILY === 'Darwin') {
-            $home = getenv('HOME');
-
-            return false !== $home ? $home.\DIRECTORY_SEPARATOR.'Library'.\DIRECTORY_SEPARATOR.'Caches' : null;
-        }
+        $repo = $hf->hub()
+            ->repo($modelId)
+            ->revision($this->revision);
 
-        $xdgCache = getenv('XDG_CACHE_HOME');
-        if (false !== $xdgCache) {
-            return $xdgCache;
-        }
-
-        $home = getenv('HOME');
-
-        return false !== $home ? $home.\DIRECTORY_SEPARATOR.'.cache' : null;
-    }
-
-    /**
-     * Ensures the cache directory exists and returns the path.
-     *
-     * @param string $dir the directory path
-     *
-     * @return null|string the directory path, or null if it cannot be created
-     */
-    protected function ensureCacheDir(string $dir): ?string
-    {
-        if (!is_dir($dir)) {
-            if (!@mkdir($dir, 0755, true) && !is_dir($dir)) {
-                return null;
-            }
-        }
-
-        return $dir;
-    }
-
-    /**
-     * Gets the cached path for a URL if it exists and is valid.
-     *
-     * @param string $url the URL to check
-     *
-     * @return null|string the cached file path, or null if not cached or invalid
-     */
-    protected function getCachedPath(string $url): ?string
-    {
-        $fsum = hash('sha256', $url);
-        $metaPattern = $this->resolvedCacheDir.\DIRECTORY_SEPARATOR.$fsum.'.*.meta';
-        $metaFiles = glob($metaPattern);
-
-        if (empty($metaFiles)) {
-            return null;
-        }
-
-        $latestMeta = null;
-        $latestTime = 0;
-        foreach ($metaFiles as $metaFile) {
-            $content = file_get_contents($metaFile);
-            if (false === $content) {
-                continue;
-            }
-
-            $meta = json_decode($content, true);
-            if (null === $meta) {
-                continue;
-            }
-
-            $creationTime = $meta['creation_time'] ?? 0;
-            if ($creationTime > $latestTime) {
-                $latestTime = $creationTime;
-                $latestMeta = $meta;
-            }
-        }
-
-        if (null === $latestMeta || !isset($latestMeta['resource_path'])) {
-            return null;
-        }
-
-        $resourcePath = $latestMeta['resource_path'];
-        if (!file_exists($resourcePath)) {
-            return null;
-        }
-
-        // Trust the cache if file exists and metadata is valid
-        // The etag-based filename (fsum.esum) provides uniqueness
-        return $resourcePath;
-    }
-
-    /**
-     * Attempt to load both tokenizer.json and tokenizer_config.json from a bundled cache entry.
-     *
-     * @return null|array<string, mixed>
-     */
-    protected function loadFromBundleCache(string $tokenizerUrl, string $tokenizerConfigUrl): ?array
-    {
-        if (null === $this->resolvedCacheDir) {
-            return null;
-        }
-
-        $bundleKey = hash('sha256', $tokenizerUrl.'|'.$tokenizerConfigUrl);
-        $metaPath = $this->resolvedCacheDir.\DIRECTORY_SEPARATOR.$bundleKey.'.bundle.meta';
-
-        if (!file_exists($metaPath)) {
-            return null;
-        }
-
-        $metaContent = file_get_contents($metaPath);
-        if (false === $metaContent) {
-            return null;
-        }
-
-        $meta = json_decode($metaContent, true);
-        if (!\is_array($meta)) {
-            return null;
-        }
-
-        $tokPath = $meta['tokenizer_path'] ?? null;
-        $tokCfgPath = $meta['tokenizer_config_path'] ?? null;
-
-        if (!$tokPath || !file_exists($tokPath)) {
-            return null;
-        }
-
-        $tokenizerContent = file_get_contents($tokPath);
-        if (false === $tokenizerContent) {
-            return null;
-        }
-
-        $tokenizerJson = json_decode($tokenizerContent, true);
-        if (\JSON_ERROR_NONE !== json_last_error()) {
-            return null;
-        }
-
-        $tokenizerConfig = null;
-        if ($tokCfgPath && file_exists($tokCfgPath)) {
-            $tokenizerConfigContent = file_get_contents($tokCfgPath);
-            if (false === $tokenizerConfigContent) {
-                return null;
-            }
-
-            $tokenizerConfig = json_decode($tokenizerConfigContent, true);
-            if (\JSON_ERROR_NONE !== json_last_error()) {
-                $tokenizerConfig = null;
-            }
-        }
-
-        return $this->mergeConfigs($tokenizerJson, $tokenizerConfig);
-    }
-
-    /**
-     * Download a JSON resource, using cache when available.
-     *
-     * @param bool $optional treat 404 as optional
-     *
-     * @return array{0: null|array<string, mixed>, 1: null|string, 2: null|string} [json, path, etag]
-     *
-     * @throws \Exception
-     */
-    protected function downloadJson(string $url, string $label, string $source, bool $optional = false): array
-    {
-        // Try cache for this resource
-        $cachedPath = $this->resolvedCacheDir ? $this->getCachedPath($url) : null;
-        if (null !== $cachedPath && file_exists($cachedPath)) {
-            $cachedContent = file_get_contents($cachedPath);
-            if (false === $cachedContent) {
-                return [null, null, null];
-            }
-
-            $json = json_decode($cachedContent, true);
-            if (\JSON_ERROR_NONE === json_last_error()) {
-                return [$json, $cachedPath, null];
-            }
-        }
-
-        $request = $this->requestFactory->createRequest('GET', $url)
-            ->withHeader('User-Agent', 'tokenizers/'.self::TOKENIZERS_VERSION.'; PHP')
-        ;
-
-        if ($this->token) {
-            $request = $request->withHeader('Authorization', "Bearer {$this->token}");
-        }
-
-        try {
-            $response = $this->sendRequest($request);
-        } catch (\Exception $e) {
-            throw new \Exception("Failed to load {$label} from Hub for model {$source}: ".$e->getMessage(), 0, $e);
-        }
-
-        $status = $response->getStatusCode();
-        if ($optional && 404 === $status) {
-            return [null, null, null];
-        }
-
-        $content = (string) $response->getBody();
-        if (200 !== $status) {
-            throw new \Exception("Failed to load {$label} from Hub for model {$source}: ".$content);
-        }
-
-        $json = json_decode($content, true);
-        if (\JSON_ERROR_NONE !== json_last_error()) {
-            throw new \Exception("Invalid JSON in {$label} from {$source}: ".json_last_error_msg());
-        }
-
-        $etag = $response->getHeaderLine('ETag') ?: null;
-        $path = null;
-
-        if (null !== $this->resolvedCacheDir && null !== $etag) {
-            $path = $this->cacheResponse($url, $content, $etag);
-        }
-
-        return [$json, $path, $etag];
-    }
-
-    /**
-     * Cache a response with a provided ETag, returning the resource path.
-     */
-    protected function cacheResponse(string $url, string $content, string $etag): ?string
-    {
-        $fsum = hash('sha256', $url);
-        $esum = hash('sha256', $etag);
-        $resourcePath = $this->resolvedCacheDir.\DIRECTORY_SEPARATOR.$fsum.'.'.$esum;
-        $metaPath = $resourcePath.'.meta';
-        $lockPath = $resourcePath.'.lock';
-
-        file_put_contents($lockPath, '');
-
-        try {
-            file_put_contents($resourcePath, $content);
+        $repo->snapshot(
+            allowPatterns: self::TOKENIZER_FILES,
+            force: false
+        );
 
-            $meta = [
-                'resource' => $url,
-                'resource_path' => $resourcePath,
-                'meta_path' => $metaPath,
-                'etag' => $etag,
-                'expires' => null,
-                'creation_time' => microtime(true),
-            ];
+        $tokenizer = $repo->download('tokenizer.json')->json();
 
-            file_put_contents($metaPath, json_encode($meta, \JSON_PRETTY_PRINT));
-        } finally {
-            if (file_exists($lockPath)) {
-                unlink($lockPath);
-            }
+        $tokenizerConfig = [];
+        if ($repo->fileExists('tokenizer_config.json')) {
+            $tokenizerConfig = $repo->download('tokenizer_config.json')->json();
         }
 
-        return $resourcePath;
-    }
-
-    /**
-     * Cache a bundle (tokenizer.json + tokenizer_config.json) into a single meta file.
-     */
-    protected function cacheBundle(
-        string $tokenizerUrl,
-        ?string $tokenizerPath,
-        ?string $tokenizerEtag,
-        string $tokenizerConfigUrl,
-        ?string $tokenizerConfigPath,
-        ?string $tokenizerConfigEtag
-    ): void {
-        if (null === $this->resolvedCacheDir) {
-            return;
-        }
-
-        $bundleKey = hash('sha256', $tokenizerUrl.'|'.$tokenizerConfigUrl);
-        $metaPath = $this->resolvedCacheDir.\DIRECTORY_SEPARATOR.$bundleKey.'.bundle.meta';
-
-        $meta = [
-            'tokenizer_url' => $tokenizerUrl,
-            'tokenizer_path' => $tokenizerPath,
-            'tokenizer_etag' => $tokenizerEtag,
-            'tokenizer_config_url' => $tokenizerConfigUrl,
-            'tokenizer_config_path' => $tokenizerConfigPath,
-            'tokenizer_config_etag' => $tokenizerConfigEtag,
-            'creation_time' => microtime(true),
-        ];
-
-        file_put_contents($metaPath, json_encode($meta, \JSON_PRETTY_PRINT));
-    }
-
-    /**
-     * Merge tokenizer.json with tokenizer_config.json (config wins).
-     *
-     * @param null|array<string, mixed> $tokenizer
-     * @param null|array<string, mixed> $tokenizerConfig
-     *
-     * @return array<string, mixed>
-     */
-    protected function mergeConfigs(?array $tokenizer, ?array $tokenizerConfig): array
-    {
-        $tokenizer ??= [];
-        $tokenizerConfig ??= [];
-
         return array_merge($tokenizer, $tokenizerConfig);
     }
 }

From 552a24d62d92838ef7d6f8616a625aa83eaeee72 Mon Sep 17 00:00:00 2001
From: Kyrian Obikwelu <koshnawaza@gmail.com>
Date: Wed, 4 Feb 2026 12:11:07 +0100
Subject: [PATCH 2/2] chore: code style fixes

---
 examples/tokenization_overview.php | 12 +++++-------
 src/Loaders/HubLoader.php          |  3 ++-
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/tokenization_overview.php b/examples/tokenization_overview.php
index 51fa527..daded47 100644
--- a/examples/tokenization_overview.php
+++ b/examples/tokenization_overview.php
@@ -16,17 +16,16 @@
  * Run with:
  *   php examples/tokenization_overview.php
  */
-
 $models = [
     'BERT (uncased)' => 'google-bert/bert-base-uncased',
-    'GPT-2'          => 'openai-community/gpt2',
-    'Qwen3 Embedding'      => 'Qwen/Qwen3-Embedding-0.6B',
+    'GPT-2' => 'openai-community/gpt2',
+    'Qwen3 Embedding' => 'Qwen/Qwen3-Embedding-0.6B',
 ];
 
 $samples = [
     'Short sentence' => 'Hello, how are you doing today?',
-    'Code snippet'   => 'function sum(int $a, int $b): int { return $a + $b; }',
-    'Mixed content'  => 'Paris is the capital of France. 42 🧠',
+    'Code snippet' => 'function sum(int $a, int $b): int { return $a + $b; }',
+    'Mixed content' => 'Paris is the capital of France. 42 🧠',
 ];
 
 echo "=== Tokenizers PHP - Tokenization Overview ===\n\n";
@@ -43,7 +42,7 @@
         $ids = $encoding->ids;
         $tokens = $encoding->tokens;
 
-        $count = \count($ids);
+        $count = count($ids);
         $idsPreview = implode(', ', array_slice($ids, 0, 10));
         $tokensPreview = implode(' ', array_slice($tokens, 0, 10));
 
@@ -56,4 +55,3 @@
 
     echo str_repeat('-', 60)."\n\n";
 }
-
diff --git a/src/Loaders/HubLoader.php b/src/Loaders/HubLoader.php
index 4529fe4..bfd20fc 100644
--- a/src/Loaders/HubLoader.php
+++ b/src/Loaders/HubLoader.php
@@ -42,7 +42,8 @@ public function load(string ...$source): array
 
         $repo = $hf->hub()
             ->repo($modelId)
-            ->revision($this->revision);
+            ->revision($this->revision)
+        ;
 
         $repo->snapshot(
             allowPatterns: self::TOKENIZER_FILES,