From 426cea05fb85cecd467567e070604d7209715e73 Mon Sep 17 00:00:00 2001
From: Kyrian Obikwelu <koshnawaza@gmail.com>
Date: Fri, 6 Feb 2026 11:16:45 +0100
Subject: [PATCH] refactor: remove redundant modelMaxLength and expose token
 arrays

---
 README.md                           |  5 -----
 src/Tokenizer.php                   | 11 ++---------
 tests/Unit/TokenizerBuilderTest.php | 23 ++---------------------
 3 files changed, 4 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index 84f40c4..ab767ec 100644
--- a/README.md
+++ b/README.md
@@ -157,9 +157,6 @@ $maxLength = $tokenizer->getConfig('model_max_length');           // 512
 $cleanup = $tokenizer->getConfig('clean_up_tokenization_spaces'); // true
 $custom = $tokenizer->getConfig('unknown_key', 'default');        // 'default'
 
-// Convenience property for model_max_length
-echo $tokenizer->modelMaxLength; // 512
-
 // Get all configuration (pass null or no arguments)
 $allConfig = $tokenizer->getConfig();
 ```
@@ -170,8 +167,6 @@ Common configuration keys:
 - `do_lowercase_and_remove_accent` — Whether to lowercase and strip accents
 - `clean_up_tokenization_spaces` — Whether to clean up spaces during decoding
 
-> **Note:** `model_max_length` is the tokenizer's configured max length, not necessarily the model's actual context window. For most models, these are the same. However, some tokenizers (like Llama 3) set this to an extremely large value. When building applications, you may want to use known context window limits for specific models rather than relying solely on this value.
-
 ## Encoding Text
 
 The `encode()` method tokenizes text and returns an `Encoding` object containing the token IDs, tokens, and type IDs.
diff --git a/src/Tokenizer.php b/src/Tokenizer.php
index 7d2223a..fd40019 100644
--- a/src/Tokenizer.php
+++ b/src/Tokenizer.php
@@ -23,10 +23,6 @@
 
 readonly class Tokenizer
 {
-    /**
-     * The model's maximum sequence length (convenience accessor for config).
-     */
-    public ?int $modelMaxLength;
     protected DictionarySplitter $addedTokensSplitter;
 
     /**
@@ -45,14 +41,11 @@ public function __construct(
         public PreTokenizerInterface $preTokenizer,
         public PostProcessorInterface $postProcessor,
         public DecoderInterface $decoder,
-        protected array $specialTokens = [],
-        protected array $addedTokens = [],
+        public array $specialTokens = [],
+        public array $addedTokens = [],
         protected array $config = []
     ) {
         $this->addedTokensSplitter = new DictionarySplitter(array_keys($this->addedTokens));
-
-        $maxLength = $this->config['model_max_length'] ?? null;
-        $this->modelMaxLength = null !== $maxLength ? (int) $maxLength : null;
     }
 
     /**
diff --git a/tests/Unit/TokenizerBuilderTest.php b/tests/Unit/TokenizerBuilderTest.php
index fec5966..cea040b 100644
--- a/tests/Unit/TokenizerBuilderTest.php
+++ b/tests/Unit/TokenizerBuilderTest.php
@@ -157,7 +157,7 @@ public function getConfig(?string $key = null, mixed $default = null): mixed
         ->build()
     ;
 
-    expect($tokenizer->modelMaxLength)->toBe(512)
+    expect($tokenizer->getConfig('model_max_length'))->toBe(512)
         ->and($tokenizer->getConfig('remove_space'))->toBeTrue()
         ->and($tokenizer->getConfig('clean_up_tokenization_spaces'))->toBeFalse()
         ->and($tokenizer->getConfig('custom_option'))->toBe('custom_value')
@@ -194,23 +194,4 @@ public function getConfig(?string $key = null, mixed $default = null): mixed
     // Should not throw, defaults are used
     $encoding = $tokenizer->encode('HELLO WORLD');
     expect($encoding->ids)->toBeArray();
-});
-
-it('sets modelMaxLength from config', function () {
-    $tokenizer = (new TokenizerBuilder())
-        ->withModel(createMockModel())
-        ->withConfig('model_max_length', 2048)
-        ->build()
-    ;
-
-    expect($tokenizer->modelMaxLength)->toBe(2048);
-});
-
-it('has null modelMaxLength when not configured', function () {
-    $tokenizer = (new TokenizerBuilder())
-        ->withModel(createMockModel())
-        ->build()
-    ;
-
-    expect($tokenizer->modelMaxLength)->toBeNull();
-});
+});