diff --git a/src/Factories/PreTokenizerFactory.php b/src/Factories/PreTokenizerFactory.php index b5dc1e3..9c1340b 100644 --- a/src/Factories/PreTokenizerFactory.php +++ b/src/Factories/PreTokenizerFactory.php @@ -8,6 +8,7 @@ use Codewithkyrian\Tokenizers\PreTokenizers\BertPreTokenizer; use Codewithkyrian\Tokenizers\PreTokenizers\ByteLevelPreTokenizer; use Codewithkyrian\Tokenizers\PreTokenizers\DigitsPreTokenizer; +use Codewithkyrian\Tokenizers\PreTokenizers\FixedLengthPreTokenizer; use Codewithkyrian\Tokenizers\PreTokenizers\MetaspacePreTokenizer; use Codewithkyrian\Tokenizers\PreTokenizers\PreTokenizerSequence; use Codewithkyrian\Tokenizers\PreTokenizers\PunctuationPreTokenizer; @@ -37,6 +38,9 @@ public static function create(array $config): PreTokenizerInterface 'Digits' => new DigitsPreTokenizer( individualDigits: $config['individual_digits'] ?? false ), + 'FixedLength' => new FixedLengthPreTokenizer( + length: $config['length'] + ), 'Metaspace' => new MetaspacePreTokenizer( replacement: $config['replacement'] ?? ' ', addPrefixSpace: $config['add_prefix_space'] ?? true, diff --git a/src/PreTokenizers/FixedLengthPreTokenizer.php b/src/PreTokenizers/FixedLengthPreTokenizer.php new file mode 100644 index 0000000..de87b60 --- /dev/null +++ b/src/PreTokenizers/FixedLengthPreTokenizer.php @@ -0,0 +1,38 @@ +preTokenize($t, $options)); + } + + return $result; + } + + $tokens = []; + $len = mb_strlen($text); + + for ($i = 0; $i < $len; $i += $this->length) { + $tokens[] = mb_substr($text, $i, $this->length); + } + + return $tokens; + } +}