Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,6 @@ $maxLength = $tokenizer->getConfig('model_max_length'); // 512
$cleanup = $tokenizer->getConfig('clean_up_tokenization_spaces'); // true
$custom = $tokenizer->getConfig('unknown_key', 'default'); // 'default'

// Convenience property for model_max_length
echo $tokenizer->modelMaxLength; // 512

// Get all configuration (pass null or no arguments)
$allConfig = $tokenizer->getConfig();
```
Expand All @@ -170,8 +167,6 @@ Common configuration keys:
- `do_lowercase_and_remove_accent` — Whether to lowercase and strip accents
- `clean_up_tokenization_spaces` — Whether to clean up spaces during decoding

> **Note:** `model_max_length` is the tokenizer's configured max length, not necessarily the model's actual context window. For most models, these are the same. However, some tokenizers (like Llama 3) set this to an extremely large value. When building applications, you may want to use known context window limits for specific models rather than relying solely on this value.

## Encoding Text

The `encode()` method tokenizes text and returns an `Encoding` object containing the token IDs, tokens, and type IDs.
Expand Down
11 changes: 2 additions & 9 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@

readonly class Tokenizer
{
/**
* The tokenizer's configured `model_max_length` (convenience accessor for config); null when the key is not set. This is not necessarily the model's actual context window.
*/
public ?int $modelMaxLength;
protected DictionarySplitter $addedTokensSplitter;

/**
Expand All @@ -45,14 +41,11 @@ public function __construct(
public PreTokenizerInterface $preTokenizer,
public PostProcessorInterface $postProcessor,
public DecoderInterface $decoder,
protected array $specialTokens = [],
protected array $addedTokens = [],
public array $specialTokens = [],
public array $addedTokens = [],
protected array $config = []
) {
$this->addedTokensSplitter = new DictionarySplitter(array_keys($this->addedTokens));

$maxLength = $this->config['model_max_length'] ?? null;
$this->modelMaxLength = null !== $maxLength ? (int) $maxLength : null;
}

/**
Expand Down
23 changes: 2 additions & 21 deletions tests/Unit/TokenizerBuilderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ public function getConfig(?string $key = null, mixed $default = null): mixed
->build()
;

expect($tokenizer->modelMaxLength)->toBe(512)
expect($tokenizer->getConfig('model_max_length'))->toBe(512)
->and($tokenizer->getConfig('remove_space'))->toBeTrue()
->and($tokenizer->getConfig('clean_up_tokenization_spaces'))->toBeFalse()
->and($tokenizer->getConfig('custom_option'))->toBe('custom_value')
Expand Down Expand Up @@ -194,23 +194,4 @@ public function getConfig(?string $key = null, mixed $default = null): mixed
// Should not throw, defaults are used
$encoding = $tokenizer->encode('HELLO WORLD');
expect($encoding->ids)->toBeArray();
});

it('sets modelMaxLength from config', function () {
$tokenizer = (new TokenizerBuilder())
->withModel(createMockModel())
->withConfig('model_max_length', 2048)
->build()
;

expect($tokenizer->modelMaxLength)->toBe(2048);
});

it('has null modelMaxLength when not configured', function () {
$tokenizer = (new TokenizerBuilder())
->withModel(createMockModel())
->build()
;

expect($tokenizer->modelMaxLength)->toBeNull();
});
});