From 426cea05fb85cecd467567e070604d7209715e73 Mon Sep 17 00:00:00 2001 From: Kyrian Obikwelu Date: Fri, 6 Feb 2026 11:16:45 +0100 Subject: [PATCH] refactor: remove redundant modelMaxLength and expose token arrays --- README.md | 5 ----- src/Tokenizer.php | 11 ++--------- tests/Unit/TokenizerBuilderTest.php | 23 ++--------------------- 3 files changed, 4 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 84f40c4..ab767ec 100644 --- a/README.md +++ b/README.md @@ -157,9 +157,6 @@ $maxLength = $tokenizer->getConfig('model_max_length'); // 512 $cleanup = $tokenizer->getConfig('clean_up_tokenization_spaces'); // true $custom = $tokenizer->getConfig('unknown_key', 'default'); // 'default' -// Convenience property for model_max_length -echo $tokenizer->modelMaxLength; // 512 - // Get all configuration (pass null or no arguments) $allConfig = $tokenizer->getConfig(); ``` @@ -170,8 +167,6 @@ Common configuration keys: - `do_lowercase_and_remove_accent` — Whether to lowercase and strip accents - `clean_up_tokenization_spaces` — Whether to clean up spaces during decoding -> **Note:** `model_max_length` is the tokenizer's configured max length, not necessarily the model's actual context window. For most models, these are the same. However, some tokenizers (like Llama 3) set this to an extremely large value. When building applications, you may want to use known context window limits for specific models rather than relying solely on this value. - ## Encoding Text The `encode()` method tokenizes text and returns an `Encoding` object containing the token IDs, tokens, and type IDs. diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 7d2223a..fd40019 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -23,10 +23,6 @@ readonly class Tokenizer { - /** - * The model's maximum sequence length (convenience accessor for config). - */ - public ?int $modelMaxLength; protected DictionarySplitter $addedTokensSplitter; /** @@ -45,14 +41,11 @@ public function __construct( public PreTokenizerInterface $preTokenizer, public PostProcessorInterface $postProcessor, public DecoderInterface $decoder, - protected array $specialTokens = [], - protected array $addedTokens = [], + public array $specialTokens = [], + public array $addedTokens = [], protected array $config = [] ) { $this->addedTokensSplitter = new DictionarySplitter(array_keys($this->addedTokens)); - - $maxLength = $this->config['model_max_length'] ?? null; - $this->modelMaxLength = null !== $maxLength ? (int) $maxLength : null; } /** diff --git a/tests/Unit/TokenizerBuilderTest.php b/tests/Unit/TokenizerBuilderTest.php index fec5966..cea040b 100644 --- a/tests/Unit/TokenizerBuilderTest.php +++ b/tests/Unit/TokenizerBuilderTest.php @@ -157,7 +157,7 @@ public function getConfig(?string $key = null, mixed $default = null): mixed ->build() ; - expect($tokenizer->modelMaxLength)->toBe(512) + expect($tokenizer->getConfig('model_max_length'))->toBe(512) ->and($tokenizer->getConfig('remove_space'))->toBeTrue() ->and($tokenizer->getConfig('clean_up_tokenization_spaces'))->toBeFalse() ->and($tokenizer->getConfig('custom_option'))->toBe('custom_value') @@ -194,23 +194,4 @@ public function getConfig(?string $key = null, mixed $default = null): mixed // Should not throw, defaults are used $encoding = $tokenizer->encode('HELLO WORLD'); expect($encoding->ids)->toBeArray(); -}); - -it('sets modelMaxLength from config', function () { - $tokenizer = (new TokenizerBuilder()) - ->withModel(createMockModel()) - ->withConfig('model_max_length', 2048) - ->build() - ; - - expect($tokenizer->modelMaxLength)->toBe(2048); -}); - -it('has null modelMaxLength when not configured', function () { - $tokenizer = (new TokenizerBuilder()) - ->withModel(createMockModel()) - ->build() - ; - - expect($tokenizer->modelMaxLength)->toBeNull(); -}); +}); \ No newline at end of file