From 8de5bb5456d1f375ee80454b25ea5e08b666df2a Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Wed, 3 Dec 2025 12:29:30 -0800 Subject: [PATCH 01/98] init einsum Signed-off-by: Phuong Nguyen --- tests/jax/test_custom_call_compute.py | 53 +++ tests/jax/test_einsum.py | 219 +++++++++ transformer_engine/jax/cpp_extensions/amax.py | 36 ++ transformer_engine/jax/cpp_extensions/base.py | 89 +++- transformer_engine/jax/cpp_extensions/gemm.py | 53 +-- .../jax/cpp_extensions/quantization.py | 40 +- transformer_engine/jax/einsum.py | 424 ++++++++++++++++++ transformer_engine/jax/quantize/tensor.py | 119 +++-- 8 files changed, 930 insertions(+), 103 deletions(-) create mode 100644 tests/jax/test_einsum.py create mode 100644 transformer_engine/jax/einsum.py diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index c8bd9d47c3..897d9f683e 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1290,6 +1290,59 @@ def test_quantize_dact_dbias_mxfp8_scaling( ) +class TestQuantizeWithVmap: + """Test vmap support for quantization primitives.""" + + @pytest_parametrize_wrapper("in_dtype", [jnp.bfloat16]) + @pytest_parametrize_wrapper("scaling_mode", supported_scaling_modes) + @pytest_parametrize_wrapper("q_layout", [QuantizeLayout.ROWWISE]) + def test_vmap_quantize(self, in_dtype, scaling_mode, q_layout): + """Test that vmap works with tex.quantize using the general batcher.""" + # Determine q_dtype based on scaling mode + if scaling_mode.is_nvfp4_scaling: + q_dtype = jnp.float4_e2m1fn + else: + q_dtype = jnp.float8_e4m3fn + + # Create batched input (E, M, K) - E experts + E, M, K = 4, 64, 128 + key = jax.random.PRNGKey(0) + batched_input = jax.random.uniform(key, (E, M, K), in_dtype) + + # Create per-expert quantizers + quantizers = [ + QuantizerFactory.create( + q_dtype=q_dtype, + scaling_mode=scaling_mode, + q_layout=q_layout, + ) + for _ in range(E) + ] + + # Stack quantizers for vmap + stacked_quantizers = jax.tree_util.tree_map(lambda *args: jnp.stack(args), *quantizers) + + # Vmap over expert dimension + def quantize_single(x, quantizer): + return tex.quantize(x, quantizer=quantizer, flatten_axis=-1) + + vmapped_quantize = jax.vmap(quantize_single, in_axes=(0, 0)) + result = vmapped_quantize(batched_input, stacked_quantizers) + + # Verify shapes + assert result.data.shape == (E, M, K) + assert result.scale_inv.shape[0] == E # Per-expert scales + + # Compare with calling quantize for each expert individually + individual_results = [] + for i in range(E): + res_i = tex.quantize(batched_input[i], quantizer=quantizers[i], flatten_axis=-1) + individual_results.append(res_i.data) + + expected = jnp.stack(individual_results, axis=0) + assert_allclose(result.data, expected, dtype=quantizers[0].q_dtype) + + valid_fp8_gemm_operand_types = [ (jnp.float8_e4m3fn, jnp.float8_e4m3fn), (jnp.float8_e5m2, jnp.float8_e4m3fn), diff --git a/tests/jax/test_einsum.py b/tests/jax/test_einsum.py new file mode 100644 index 0000000000..39dffa6787 --- /dev/null +++ b/tests/jax/test_einsum.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Tests for TE einsum operation with FP8 quantization.""" + +import jax +import jax.numpy as jnp +import pytest +from jax import value_and_grad + +from utils import assert_allclose, pytest_parametrize_wrapper +from transformer_engine.jax.einsum import einsum +from transformer_engine.jax.quantize import ( + QuantizerFactory, + QuantizeMeta, + QuantizeMetaSet, +) +from transformer_engine.jax.quantize import helper + + +# Test parameters +DTYPES = [jnp.bfloat16] +# (B, S, M, E, C, H) +# B: Batch size +# S: Sequence length (number of tokens) +# M: Model dimension (hidden size) +# E: Number of experts +# C: Capacity (max tokens per expert) +# H: Hidden dimension (MLP intermediate size) +MOE_CASES = [ + (2, 32, 128, 4, 32, 64), +] + +# Get supported recipes +supported_recipes = helper.get_supported_quantization_recipes() +supported_recipes = [pytest.param(r, id=r.__class__.__name__) for r in supported_recipes] + + +@pytest.fixture(autouse=True, scope="module") +def init(): + """WAR for CUDA uninitialize error""" + # Calling customcalls before jax may cause CUDA uninitialize error + _ = jnp.zeros(0) + yield + + +class TestMoEMLPWithRecipes: + """Test MoE MLP operations with different FP8 recipes and gradients.""" + + def _get_quantizer_sets(self, recipe, num_experts): + return QuantizerFactory.create_set( + n_quantizer_sets=num_experts, + fp8_recipe=recipe, + quantize_meta_set=QuantizeMetaSet( + x=QuantizeMeta(), kernel=QuantizeMeta(), grad=QuantizeMeta() + ), + ) + + def _einsum(self, equation, *operands, quantizer_sets=None, quantizer_dim=None, fallback=False): + out = einsum( + equation, + *operands, + quantizer_sets=quantizer_sets, + quantizer_dim=quantizer_dim, + fallback=fallback, + ) + return jnp.mean(out) + + def _ref_einsum(self, equation, *operands): + out = jnp.einsum(equation, *operands) + return jnp.mean(out) + + @pytest_parametrize_wrapper("B,S,M,E,C,H", MOE_CASES) + @pytest_parametrize_wrapper("recipe", supported_recipes) + def test_mlp_up_grad(self, B, S, M, E, C, H, recipe): + """Test MLP up: EBCM,EMH->EBCH with gradients and different recipes.""" + # Create per-expert quantizers + quantizer_sets = self._get_quantizer_sets(recipe, E) + dispatched = jax.random.normal( + jax.random.PRNGKey(0), (E, B, C, M), dtype=jnp.bfloat16 + ) / jnp.sqrt(M) + weights = jax.random.normal(jax.random.PRNGKey(1), (E, M, H), dtype=jnp.bfloat16) + + # Compute with TE einsum with quantization + loss_te, grads_te = value_and_grad(self._einsum, argnums=(1, 2))( + "EBCM,EMH->EBCH", dispatched, weights, quantizer_sets=quantizer_sets, quantizer_dim="E" + ) + + # Compute reference (BF16) + loss_ref, grads_ref = value_and_grad(self._ref_einsum, argnums=(1, 2))( + "EBCM,EMH->EBCH", dispatched, weights + ) + + # Verify shapes and no NaNs + assert grads_te[0].shape == dispatched.shape + assert grads_te[1].shape == weights.shape + assert not jnp.isnan(loss_te) + assert jnp.all(jnp.isfinite(grads_te[0])) + assert jnp.all(jnp.isfinite(grads_te[1])) + + # Compare with reference (with FP8 tolerance) + assert_allclose(loss_te, loss_ref, dtype=quantizer_sets[0].x.q_dtype) + assert_allclose(grads_te[0], grads_ref[0], dtype=quantizer_sets[0].dgrad.q_dtype) + assert_allclose(grads_te[1], grads_ref[1], dtype=quantizer_sets[0].dgrad.q_dtype) + + @pytest_parametrize_wrapper("B,S,M,E,C,H", MOE_CASES) + @pytest_parametrize_wrapper("recipe", supported_recipes) + def test_mlp_down_grad(self, B, S, M, E, C, H, recipe): + """Test MLP down: EBCH,EHM->EBCM with gradients and different recipes.""" + # Create per-expert quantizers + quantizer_sets = self._get_quantizer_sets(recipe, E) + + hidden = jax.random.normal( + jax.random.PRNGKey(0), (E, B, C, H), dtype=jnp.bfloat16 + ) / jnp.sqrt(H) + weights = jax.random.normal(jax.random.PRNGKey(1), (E, H, M), dtype=jnp.bfloat16) + + # Compute with TE einsum with quantization + loss_te, grads_te = value_and_grad(self._einsum, argnums=(1, 2))( + "EBCH,EHM->EBCM", hidden, weights, quantizer_sets=quantizer_sets, quantizer_dim="E" + ) + + # Compute reference (BF16) + loss_ref, grads_ref = value_and_grad(self._ref_einsum, argnums=(1, 2))( + "EBCH,EHM->EBCM", hidden, weights + ) + + # Verify shapes and no NaNs + assert grads_te[0].shape == hidden.shape + assert grads_te[1].shape == weights.shape + assert not jnp.isnan(loss_te) + assert jnp.all(jnp.isfinite(grads_te[0])) + assert jnp.all(jnp.isfinite(grads_te[1])) + + # Compare with reference (with FP8 tolerance) + assert_allclose(loss_te, loss_ref, dtype=quantizer_sets[0].x.q_dtype) + assert_allclose(grads_te[0], grads_ref[0], dtype=quantizer_sets[0].dgrad.q_dtype) + assert_allclose(grads_te[1], grads_ref[1], dtype=quantizer_sets[0].dgrad.q_dtype) + + @pytest_parametrize_wrapper("B,S,M,E,C,H", MOE_CASES) + @pytest_parametrize_wrapper("recipe", supported_recipes) + def test_full_moe_grad(self, B, S, M, E, C, H, recipe): + """Test full MoE pipeline (all 4 einsums) with gradients and different recipes.""" + # Create per-expert quantizers for each einsum + mlp_up_quantizer_sets = self._get_quantizer_sets(recipe, E) + mlp_down_quantizer_sets = self._get_quantizer_sets(recipe, E) + + tokens = jax.random.normal(jax.random.PRNGKey(0), (B, S, M), dtype=jnp.bfloat16) / jnp.sqrt(M) + routing = jax.random.normal(jax.random.PRNGKey(1), (B, S, E, C), dtype=jnp.bfloat16) + routing = jax.nn.softmax(routing, axis=-1) # Normalize routing weights + up_weights = jax.random.normal( + jax.random.PRNGKey(2), (E, M, H), dtype=jnp.bfloat16 + ) / jnp.sqrt(H) + down_weights = jax.random.normal( + jax.random.PRNGKey(3), (E, H, M), dtype=jnp.bfloat16 + ) / jnp.sqrt(M) + + # TE implementation with quantization + def full_moe_te(tokens, routing, up_w, down_w): + """Complete MoE pipeline with TE einsum.""" + dispatched = einsum("BSM,BSEC->EBCM", tokens, routing, fallback=True) + hidden = einsum( + "EBCM,EMH->EBCH", + dispatched, + up_w, + quantizer_sets=mlp_up_quantizer_sets, + quantizer_dim="E", + ) + expert_out = einsum( + "EBCH,EHM->EBCM", + hidden, + down_w, + quantizer_sets=mlp_down_quantizer_sets, + quantizer_dim="E", + ) + output = einsum("EBCM,BSEC->BSM", expert_out, routing, fallback=True) + return jnp.sum(output) + + # Reference implementation with jnp.einsum + def full_moe_ref(tokens, routing, up_w, down_w): + """Complete MoE pipeline with jnp.einsum.""" + dispatched = jnp.einsum("BSM,BSEC->EBCM", tokens, routing) + hidden = jnp.einsum("EBCM,EMH->EBCH", dispatched, up_w) + expert_out = jnp.einsum("EBCH,EHM->EBCM", hidden, down_w) + output = jnp.einsum("EBCM,BSEC->BSM", expert_out, routing) + return jnp.sum(output) + + loss_te, grads_te = value_and_grad(full_moe_te, argnums=(0, 1, 2, 3))( + tokens, routing, up_weights, down_weights + ) + + loss_ref, grads_ref = value_and_grad(full_moe_ref, argnums=(0, 1, 2, 3))( + tokens, routing, up_weights, down_weights + ) + + # Verify all gradient shapes + assert grads_te[0].shape == tokens.shape, f"tokens grad shape mismatch" + assert grads_te[1].shape == routing.shape, f"routing grad shape mismatch" + assert grads_te[2].shape == up_weights.shape, f"up_weights grad shape mismatch" + assert grads_te[3].shape == down_weights.shape, f"down_weights grad shape mismatch" + + # Verify no NaNs or Infs + assert not jnp.isnan(loss_te), "Loss is NaN" + assert jnp.isfinite(loss_te), "Loss is Inf" + assert jnp.all(jnp.isfinite(grads_te[0])), "tokens grad has NaN/Inf" + assert jnp.all(jnp.isfinite(grads_te[1])), "routing grad has NaN/Inf" + assert jnp.all(jnp.isfinite(grads_te[2])), "up_weights grad has NaN/Inf" + assert jnp.all(jnp.isfinite(grads_te[3])), "down_weights grad has NaN/Inf" + + # Compare with reference (with FP8 tolerance) + assert_allclose(loss_te, loss_ref, dtype=mlp_up_quantizer_sets[0].x.q_dtype) + assert_allclose(grads_te[0], grads_ref[0], dtype=mlp_up_quantizer_sets[0].dgrad.q_dtype) + assert_allclose(grads_te[1], grads_ref[1], dtype=mlp_up_quantizer_sets[0].dgrad.q_dtype) + assert_allclose(grads_te[2], grads_ref[2], dtype=mlp_down_quantizer_sets[0].x.q_dtype) + assert_allclose(grads_te[3], grads_ref[3], dtype=mlp_down_quantizer_sets[0].dgrad.q_dtype) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/transformer_engine/jax/cpp_extensions/amax.py b/transformer_engine/jax/cpp_extensions/amax.py index 2f3bc402ec..19e229c1ee 100644 --- a/transformer_engine/jax/cpp_extensions/amax.py +++ b/transformer_engine/jax/cpp_extensions/amax.py @@ -160,6 +160,18 @@ def shardy_sharding_rule(amax_scope, transpose_batch_sequence, mesh, value_types output_spec = (f"{prefix}_amax",) return SdyShardingRule((input_spec,), (output_spec,)) + @staticmethod + def batcher(batched_args, batch_dims, *, amax_scope, transpose_batch_sequence): + """Batcher for amax calculation - returns single amax value.""" + return AmaxCalculationPrimitive.batcher_impl( + batched_args, + batch_dims, + static_kwargs={ + "amax_scope": amax_scope, + "transpose_batch_sequence": transpose_batch_sequence, + }, + ) + register_primitive(AmaxCalculationPrimitive, outer_only=True) @@ -370,6 +382,30 @@ def shardy_sharding_rule( output_post_rht_amax_spec = (f"{prefix}_post_rht_amax",) return SdyShardingRule((input_spec,), (output_amax_spec, output_post_rht_amax_spec)) + @staticmethod + def batcher( + batched_args, + batch_dims, + *, + amax_scope, + transpose_batch_sequence, + rht_matrix_random_sign_mask_t, + produce_regular_amax, + flatten_axis, + ): + """Batcher for RHT amax calculation - returns 2 amax values.""" + return RHTAmaxCalculationPrimitive.batcher_impl( + batched_args, + batch_dims, + static_kwargs={ + "amax_scope": amax_scope, + "transpose_batch_sequence": transpose_batch_sequence, + "rht_matrix_random_sign_mask_t": rht_matrix_random_sign_mask_t, + "produce_regular_amax": produce_regular_amax, + "flatten_axis": flatten_axis, + }, + ) + register_primitive(RHTAmaxCalculationPrimitive) diff --git a/transformer_engine/jax/cpp_extensions/base.py b/transformer_engine/jax/cpp_extensions/base.py index 556b587191..9f88265e93 100644 --- a/transformer_engine/jax/cpp_extensions/base.py +++ b/transformer_engine/jax/cpp_extensions/base.py @@ -7,13 +7,14 @@ import warnings from abc import ABCMeta, abstractmethod from functools import partial +from typing import Any, Sequence, Union, Tuple from jax.extend import core from jax.interpreters import xla, mlir from jax.experimental.custom_partitioning import custom_partitioning from jax._src.interpreters import batching from jax._src import dispatch -from jax import ffi +from jax import ffi, numpy as jnp import transformer_engine_jax @@ -168,6 +169,92 @@ def shardy_sharding_rule(*args): del args return "... -> ..." + @classmethod + def batcher_impl( + cls, + batched_args: Sequence[Any], + batch_dims: Sequence[Union[int, None]], + static_kwargs: dict, + ) -> Tuple[Tuple[Any, ...], Tuple[Union[int, None], ...]]: + """Batcher implementation for JAX primitives. + + Implements the standard batching pattern: loop over batch dimension, + call primitive for each slice, and stack results. + + Args: + batched_args: Tuple of input tensors (some may be batched) + batch_dims: Tuple indicating batch dimension for each arg (None if not batched) + static_kwargs: Dictionary of static arguments to pass to primitive.bind() + + Returns: + Tuple of (output_tensors, output_batch_dims) + + Example: + @staticmethod + def batcher(batched_args, batch_dims, *, arg1, arg2, arg3): + return MyPrimitive.batcher_impl( + batched_args, batch_dims, + static_kwargs={'arg1': arg1, 'arg2': arg2, 'arg3': arg3}, + ) + """ + from jax import lax + + # Find batch dimension and validate all batched args have the same batch_dim + batch_dim = None + batch_size = None + for arg, bdim in zip(batched_args, batch_dims): + if bdim is not None: + if batch_dim is None: + batch_dim = bdim + batch_size = arg.shape[bdim] + elif bdim != batch_dim: + raise ValueError( + "All batched arguments must have the same batch dimension. " + f"Got batch_dims={batch_dims}" + ) + assert batch_dim is not None and batch_size is not None, "Invalid batching config!" + + # Loop over batch dimension and collect results + all_results = [] + + for i in range(batch_size): + # Extract slice for each argument + sliced_args = [] + for arg, bdim in zip(batched_args, batch_dims): + if bdim is not None: + slice_i = lax.index_in_dim(arg, i, bdim, keepdims=False) + sliced_args.append(slice_i) + else: # For empty args + sliced_args.append(arg) + + # Call primitive with unbatched slices + result_i = cls.outer_primitive.bind(*sliced_args, **static_kwargs) + + # Normalize to tuple + if not isinstance(result_i, (tuple, list)): + result_i = (result_i,) + elif isinstance(result_i, list): + result_i = tuple(result_i) + + all_results.append(result_i) + + # Transpose: from list of tuples to tuple of lists + # all_results = [(out0_0, out1_0, ...), (out0_1, out1_1, ...), ...] + # transposed = ([out0_0, out0_1, ...], [out1_0, out1_1, ...], ...) + transposed = tuple(zip(*all_results)) + + # Stack each output along the batch dimension + stacked_results = tuple( + jnp.stack(list(out_list), axis=batch_dim) for out_list in transposed + ) + + # Single output: return unwrapped result + if len(stacked_results) == 1: + return stacked_results[0], batch_dim + + # Multiple outputs: return tuple of results + return stacked_results, [batch_dim for _ in stacked_results] + # Registry to store all registered primitive classes _primitive_registry = {} diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 76a8b225ba..55a1700838 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -808,40 +808,33 @@ def batcher( sequence_dim, is_outer, ): - del transpose_batch_sequence, sequence_dim, is_outer assert GemmPrimitive.outer_primitive is not None lhs_bdims, _, rhs_bdims, *_ = batch_dims - # Batched GEMM is not supported - assert ( - lhs_bdims is None and rhs_bdims is None - ), f"(Batching is not supported, got lhs_bdims={lhs_bdims}, rhs_bdims={rhs_bdims})" - out_bdims = (None,) - - # Bias gradient is never batched - bias_bdims = (None,) - - # Pre-GeLU output, if exists, is batched like GEMM output - pre_gelu_bdims = (None,) - if fuse_gelu and not grad: - pre_gelu_bdims = out_bdims + # Validate batch dimensions + if lhs_bdims is not None or rhs_bdims is not None: + assert lhs_bdims == rhs_bdims, ( + "Batched GEMM requires matching batch dimensions, " + f"got lhs_bdims={lhs_bdims}, rhs_bdims={rhs_bdims}" + ) - return ( - GemmPrimitive.outer_primitive.bind( - *batched_args, - out_dtype=out_dtype, - contracting_dims=contracting_dims, - scaling_mode=scaling_mode, - fuse_bias=fuse_bias, - fuse_gelu=fuse_gelu, - grad=grad, - use_split_accumulator=use_split_accumulator, - collective_op=collective_op, - transpose_batch_sequence=transpose_batch_sequence, - sequence_dim=sequence_dim, - is_outer=is_outer, - ), - (out_bdims, bias_bdims, pre_gelu_bdims), + # Use general batcher from BasePrimitive + return GemmPrimitive.batcher_impl( + batched_args, + batch_dims, + static_kwargs={ + "out_dtype": out_dtype, + "contracting_dims": contracting_dims, + "scaling_mode": scaling_mode, + "fuse_bias": fuse_bias, + "fuse_gelu": fuse_gelu, + "grad": grad, + "use_split_accumulator": use_split_accumulator, + "collective_op": collective_op, + "transpose_batch_sequence": transpose_batch_sequence, + "sequence_dim": sequence_dim, + "is_outer": is_outer, + }, ) @staticmethod diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py index b3f24e9337..53c6937fb4 100644 --- a/transformer_engine/jax/cpp_extensions/quantization.py +++ b/transformer_engine/jax/cpp_extensions/quantization.py @@ -361,34 +361,24 @@ def batcher( stochastic_rounding, use_rht, ): - """ - to describe batch rules for vmap - """ - del is_outer + """Batch rule for quantization primitive using general batcher.""" check_valid_batch_dims(batch_dims) assert BaseDBiasQuantizePrimitive.outer_primitive is not None - x, scale, amax, sr_rng_state, post_rht_amax, rht_matrix = batched_args - x_bdim, scale_bdim, amax_bdim, _, _, _ = batch_dims - out_bdims = x_bdim, x_bdim, scale_bdim, scale_bdim, amax_bdim, x_bdim - return ( - BaseDBiasQuantizePrimitive.outer_primitive.bind( - x, - scale, - amax, - sr_rng_state, - post_rht_amax, - rht_matrix, - out_dtype=out_dtype, - scaling_mode=scaling_mode, - q_layout=q_layout, - flatten_axis=flatten_axis, - scale_dtype=scale_dtype, - is_dbias=is_dbias, - stochastic_rounding=stochastic_rounding, - use_rht=use_rht, - ), - out_bdims, + return BaseDBiasQuantizePrimitive.batcher_impl( + batched_args, + batch_dims, + static_kwargs={ + "out_dtype": out_dtype, + "scaling_mode": scaling_mode, + "q_layout": q_layout, + "flatten_axis": flatten_axis, + "scale_dtype": scale_dtype, + "is_dbias": is_dbias, + "is_outer": is_outer, + "stochastic_rounding": stochastic_rounding, + "use_rht": use_rht, + }, ) @staticmethod diff --git a/transformer_engine/jax/einsum.py b/transformer_engine/jax/einsum.py new file mode 100644 index 0000000000..20084c77ea --- /dev/null +++ b/transformer_engine/jax/einsum.py @@ -0,0 +1,424 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""Einsum operation with FP8 quantization support for Transformer Engine in JAX. + +This module provides an einsum implementation that decomposes einsum operations into +a sequence of GEMMs, each with its own quantizer for FP8 support. It follows the +pattern of jax.numpy.einsum but uses TE's optimized GEMM operations. + +This module provides an einsum implementation optimized for Mixture-of-Experts (MoE) +models with per-expert quantization support. It leverages JAX's vmap and TE's dense +layer to efficiently handle tensor contractions with a single batch dimension. + +Key Features: + - **Per-expert quantization**: Each expert can have independent scaling and quantization parameters + - **Automatic differentiation**: Full gradient support via dense layer's VJP + - **Single batch dimension**: Optimized for MoE patterns (expert dimension) + - **Explicit API**: Requires quantizer_dim when using quantization + +Limitations: + - **NN layout only**: LHS last dim must contract, RHS last dim must not contract + - **Single batch dimension**: Only one batch dimension supported + - **2-operand only**: Only supports binary operations + - **Explicit quantizer_dim**: Required when quantizer_sets is provided + + For operations that don't meet these requirements (e.g., routing operations + like "BSM,BSEC->EBCM"), use jnp.einsum instead, or set fallback=True to + automatically fall back to jnp.einsum when the operation is not supported. + +Example - MoE Forward Pass with Per-Expert FP8: + ```python + from transformer_engine.jax.einsum import einsum + from transformer_engine.jax.quantize import QuantizerFactory, QuantizeMeta, QuantizeMetaSet + + # Create per-expert quantizers (E experts) + quantizer_sets = [ + QuantizerFactory.create_set( + fp8_recipe=recipe, + quantize_meta_set=QuantizeMetaSet( + x=QuantizeMeta(), kernel=QuantizeMeta(), grad=QuantizeMeta() + ) + ) for _ in range(num_experts) + ] + + # MoE pipeline with per-expert quantization, + # 1. Dispatch: BSM,BSEC -> EBCM (no quantization - routing operation) + dispatched = jnp.einsum("BSM,BSEC->EBCM", tokens, routing) + # Or with fallback: + # dispatched = einsum("BSM,BSEC->EBCM", tokens, routing, fallback=True) + + # 2. MLP Up: EBCM,EMH -> EBCH (per-expert quantization) + hidden = einsum("EBCM,EMH->EBCH", dispatched, expert_up_weights, + quantizer_sets=expert_quantizers, quantizer_dim='E') + + # 3. MLP Down: EBCH,EHM -> EBCM (per-expert quantization) + expert_out = einsum("EBCH,EHM->EBCM", hidden, expert_down_weights, + quantizer_sets=expert_quantizers, quantizer_dim='E') + + # 4. Combine: EBCM,BSEC -> BSM (no quantization - routing operation) + output = jnp.einsum("EBCM,BSEC->BSM", expert_out, routing) + # Or with fallback: + # output = einsum("EBCM,BSEC->BSM", expert_out, routing, fallback=True) + ``` + +Implementation Details: + The einsum function works by: + 1. Parsing the einsum equation to identify the single batch dimension and contracting dimensions + 2. Validating that quantizer_sets length matches the quantizer dimension size + 3. Creating a vmapped version of TE's dense layer over the batch dimension + 4. Vmapping over quantizer_sets to provide per-batch (e.g., per-expert) quantization + 5. Leveraging dense's existing VJP for automatic differentiation + + This design reuses TE's well-tested dense layer infrastructure while enabling + per-expert quantization for MoE models with minimal code complexity. +""" + +from typing import Tuple, Optional, List +import jax +import jax.numpy as jnp + +from .dense import dense +from .quantize import ( + QuantizerSet, + noop_quantizer_set, +) + + +def _parse_einsum_input(equation: str, *operands) -> Tuple[str, List[str], str]: + """Parse einsum equation into input specs and output spec. + + Args: + equation: Einsum equation string (e.g., "ij,jk->ik" or "BNSM,BNSEC->EBNCM") + operands: Input tensors + + Returns: + Tuple of (equation, input_specs, output_spec) + + Raises: + ValueError: If number of operands doesn't match equation + """ + # Remove spaces + equation = equation.replace(" ", "") + + if "->" in equation: + inputs_str, output_str = equation.split("->") + input_specs = inputs_str.split(",") + else: + # Implicit output mode + inputs_str = equation + input_specs = inputs_str.split(",") + # Compute implicit output + all_indices = set() + for spec in input_specs: + all_indices.update(spec) + output_str = "".join(sorted(all_indices)) + + # Validate each operand's ndim matches its spec + for i, (operand, spec) in enumerate(zip(operands, input_specs)): + expected_ndim = len(spec) + actual_ndim = operand.ndim + if actual_ndim != expected_ndim: + raise ValueError( + f"Operand {i} has {actual_ndim} dimensions but equation '{equation}' " + f"expects {expected_ndim} dimensions (spec: '{spec}'). " + f"Operand shape: {operand.shape}" + ) + + return equation, input_specs, output_str + + +def _find_contracting_and_batch_dims(lhs_spec: str, rhs_spec: str, output_spec: str): + """Find contracting and batch dimensions for a GEMM operation. + + Args: + lhs_spec: Index specification for LHS (e.g., "BNSM") + rhs_spec: Index specification for RHS (e.g., "BNSEC") + output_spec: Index specification for output (e.g., "EBNCM") + + Returns: + Tuple of (lhs_contracting, rhs_contracting, lhs_batch, rhs_batch) + """ + # Contracting dimensions: indices in both lhs and rhs but not in output + lhs_set = set(lhs_spec) + rhs_set = set(rhs_spec) + output_set = set(output_spec) + + contracting_indices = (lhs_set & rhs_set) - output_set + + # Batch dimensions: indices in lhs, rhs, and output + batch_indices = lhs_set & rhs_set & output_set + + # Find positions + lhs_contracting = tuple(i for i, c in enumerate(lhs_spec) if c in contracting_indices) + rhs_contracting = tuple(i for i, c in enumerate(rhs_spec) if c in contracting_indices) + lhs_batch = tuple(i for i, c in enumerate(lhs_spec) if c in batch_indices) + rhs_batch = tuple(i for i, c in enumerate(rhs_spec) if c in batch_indices) + + return lhs_contracting, rhs_contracting, lhs_batch, rhs_batch + + +def _einsum_to_gemm_info(equation: str, *operands): + """Extract GEMM information from einsum equation. + + Args: + equation: Einsum equation + operands: Input tensors + + Returns: + Dict with keys: lhs_idx, rhs_idx, contracting_dims, batch_dims, output_spec + """ + equation, input_specs, output_spec = _parse_einsum_input(equation, *operands) + + if len(input_specs) != 2: + raise NotImplementedError(f"Einsum with {len(input_specs)} operands not yet supported") + + lhs_spec, rhs_spec = input_specs + + lhs_contracting, rhs_contracting, lhs_batch, rhs_batch = _find_contracting_and_batch_dims( + lhs_spec, rhs_spec, output_spec + ) + + return { + "lhs_idx": 0, + "rhs_idx": 1, + "lhs_spec": lhs_spec, + "rhs_spec": rhs_spec, + "output_spec": output_spec, + "contracting_dims": (lhs_contracting, rhs_contracting), + "batch_dims": (lhs_batch, rhs_batch), + } + + +def einsum( + equation: str, + *operands: jnp.ndarray, + quantizer_sets: Optional[List[QuantizerSet]] = None, + quantizer_dim: Optional[str] = None, + operand_axes: Optional[List[Tuple[str, ...]]] = None, + output_axes: Optional[Tuple[str, ...]] = None, + fallback: bool = False, +) -> jnp.ndarray: + """Perform einsum operation with optional FP8 quantization using vmap + dense. + + This function implements einsum by: + 1. Identifying batch dimensions + 2. Using vmap to vectorize over batch dimensions + 3. Calling the existing dense() function which has VJP already implemented + + Each batched GEMM can have its own quantizer_set, enabling per-expert + quantization in MoE models. + + Args: + equation: Einsum equation string (e.g., "ij,jk->ik", "BSM,BSEC->EBCM") + *operands: Input tensors + quantizer_sets: List or tuple of QuantizerSets. Length must match the size of + the dimension specified by quantizer_dim. If None, creates noop quantizers. + quantizer_dim: Index label indicating which dimension the quantizers correspond to. + For MoE, this is typically 'E' (expert dimension). If None and + quantizer_sets is provided, assumes first batch dimension at position 0. + operand_axes: List of logical axes tuples for sharding each operand + output_axes: Logical axes for sharding the output + fallback: Whether to fallback to jnp.einsum if the einsum operation is not supported. + When fallback=True, unsupported operations (e.g., non-NN layouts, routing + operations) will use jnp.einsum. Note: quantization will NOT be applied + when falling back. + + Returns: + Result of the einsum operation + + Examples: + # Simple matrix multiplication with FP8 + result = einsum("ij,jk->ik", A, B, quantizer_sets=my_quantizer_set) + + # MoE with per-expert quantizers (E experts) + expert_quantizers = [quantizer_e0, quantizer_e1, ..., quantizer_eN] + result = einsum("EBNCM,EMH->EBNCH", tokens, weights, + quantizer_sets=expert_quantizers) + + # With fallback for routing operations + result = einsum("BSM,BSEC->EBCM", tokens, routing, fallback=True) + # Falls back to jnp.einsum (no quantization) + """ + if operand_axes is None: + operand_axes = [None] * len(operands) + + if len(operands) != 2: + if fallback: + import warnings + + warnings.warn( + f"TE einsum only supports 2-operand einsum, got {len(operands)} operands. " + "Falling back to jnp.einsum (no quantization will be applied).", + stacklevel=2, + ) + return jnp.einsum(equation, *operands) + raise NotImplementedError("Only 2-operand einsum currently supported") + + # Parse einsum to get GEMM info + gemm_info = _einsum_to_gemm_info(equation, *operands) + contracting_dims = gemm_info["contracting_dims"] + batch_dims = gemm_info["batch_dims"] + lhs_spec = gemm_info["lhs_spec"] + rhs_spec = gemm_info["rhs_spec"] + + lhs, rhs = operands + + # Validate quantizer_dim is provided when quantizer_sets is given + if quantizer_sets is not None and quantizer_dim is None: + raise ValueError( + "quantizer_dim must be specified when quantizer_sets is provided. " + "This explicitly indicates which dimension the quantizers correspond to." + ) + + # Find quantizer dimension + quantizer_dim_lhs = None + quantizer_dim_rhs = None + + if quantizer_dim is not None: + # Find position of quantizer_dim in lhs and rhs specs + if quantizer_dim in lhs_spec: + quantizer_dim_lhs = lhs_spec.index(quantizer_dim) + if quantizer_dim in rhs_spec: + quantizer_dim_rhs = rhs_spec.index(quantizer_dim) + + if quantizer_dim_lhs is None and quantizer_dim_rhs is None: + raise ValueError(f"quantizer_dim '{quantizer_dim}' not found in equation '{equation}'") + + # Check if we have batch dimensions + has_batch_dims = bool(batch_dims[0] or batch_dims[1]) + + # Determine expected quantizer_sets length based on quantizer_dim + if quantizer_dim is not None: + if quantizer_dim_lhs is not None: + expected_length = lhs.shape[quantizer_dim_lhs] + else: + expected_length = rhs.shape[quantizer_dim_rhs] + else: + # No quantizer_dim: determine from batch dimension + if has_batch_dims: + expected_length = lhs.shape[batch_dims[0][0]] + else: + expected_length = 1 + + # Validate and initialize quantizer_sets + if quantizer_sets is None: + quantizer_sets = [noop_quantizer_set] * expected_length + elif not isinstance(quantizer_sets, (list, tuple)): + raise TypeError(f"quantizer_sets must be a list or tuple, got {type(quantizer_sets)}") + elif len(quantizer_sets) != expected_length: + raise ValueError( + f"quantizer_sets length ({len(quantizer_sets)}) must match " + f"{'dimension ' + repr(quantizer_dim) if quantizer_dim else 'batch dimension'} " + f"size ({expected_length})" + ) + + # Validate that this is NN layout (required by dense) + # For NN: lhs last dim must contract, rhs last dim must NOT contract + lhs_ndim = len(gemm_info["lhs_spec"]) + rhs_ndim = len(gemm_info["rhs_spec"]) + lhs_last_contracts = lhs_ndim - 1 in contracting_dims[0] + rhs_last_contracts = rhs_ndim - 1 in contracting_dims[1] + + if not lhs_last_contracts or rhs_last_contracts: + if fallback: + import warnings + + if quantizer_sets is not None and quantizer_sets != [noop_quantizer_set] * len( + quantizer_sets + ): + warnings.warn( + f"TE einsum only supports NN layout. Equation '{equation}' is not NN layout. " + "Falling back to jnp.einsum. WARNING: Quantization will NOT be applied!", + stacklevel=2, + ) + return jnp.einsum(equation, *operands) + raise ValueError( + "TE einsum only supports NN layout (non-transposed matrix multiplication). Equation" + f" '{equation}' is not NN layout:\n - LHS '{gemm_info['lhs_spec']}': last dimension" + f" must contract (got contracting_dims={contracting_dims[0]})\n - RHS" + f" '{gemm_info['rhs_spec']}': last dimension must NOT contract (got" + f" contracting_dims={contracting_dims[1]})\nFor non-NN layouts (e.g., routing" + " operations), use jnp.einsum instead." + ) + + # Create vmapped dense function for batch dimensions + has_batch_dims = bool(batch_dims[0] or batch_dims[1]) + + if has_batch_dims: + # Validate single batch dimension (MoE use case) + if len(batch_dims[0]) != 1 or len(batch_dims[1]) != 1: + if fallback: + import warnings + + if quantizer_sets is not None and quantizer_sets != [noop_quantizer_set] * len( + quantizer_sets + ): + warnings.warn( + "TE einsum only supports single batch dimension. Got" + f" {len(batch_dims[0])} batch dims in lhs and {len(batch_dims[1])} in rhs." + " Falling back to jnp.einsum. WARNING: Quantization will NOT be applied!", + stacklevel=2, + ) + return jnp.einsum(equation, *operands) + raise NotImplementedError( + "Only single batch dimension is currently supported. " + f"Got {len(batch_dims[0])} batch dims in lhs and {len(batch_dims[1])} in rhs. " + f"Equation: '{equation}'" + ) + + lhs_batch_dim = batch_dims[0][0] + rhs_batch_dim = batch_dims[1][0] + + # Adjust contracting dims for the unbatched shapes seen by Python code + # (primitives will see batched shapes, but Python validation sees unbatched) + adj_lhs_contracting = tuple( + dim - (1 if dim > lhs_batch_dim else 0) for dim in contracting_dims[0] + ) + adj_rhs_contracting = tuple( + dim - (1 if dim > rhs_batch_dim else 0) for dim in contracting_dims[1] + ) + adj_contracting_dims = (adj_lhs_contracting, adj_rhs_contracting) + + # Stack quantizers into a pytree structure that vmap can handle + # QuantizerSet is already a pytree, so we can stack them + # For BF16 without quantizer_dim, this will be a stack of noop_quantizer_sets + stacked_quantizers = jax.tree_util.tree_map(lambda *args: jnp.stack(args), *quantizer_sets) + + # Vmap over quantizers (or repeated noop quantizers for BF16) + def dense_with_quantizer(lhs_single, rhs_single, quantizer_set): + """Dense with explicit quantizer argument for vmapping.""" + return dense( + lhs_single, + rhs_single, + None, + contracting_dims=adj_contracting_dims, # Adjusted for unbatched shapes + transpose_batch_sequence=False, + input_axes=operand_axes[0], + kernel_axes=operand_axes[1], + output_axes=output_axes, + quantizer_set=quantizer_set, + ) + + vmapped_func = jax.vmap( + dense_with_quantizer, + in_axes=(lhs_batch_dim, rhs_batch_dim, 0), # vmap over stacked quantizers + out_axes=0, + ) + output = vmapped_func(lhs, rhs, stacked_quantizers) + else: + # No batch dimensions - direct dense call + # quantizer_set length already validated to be 1 + output = dense( + lhs, + rhs, + None, + contracting_dims=contracting_dims, + transpose_batch_sequence=False, + input_axes=operand_axes[0], + kernel_axes=operand_axes[1], + output_axes=output_axes, + quantizer_set=quantizer_sets[0], + ) + + return output diff --git a/transformer_engine/jax/quantize/tensor.py b/transformer_engine/jax/quantize/tensor.py index 90f139c3da..120bd05c13 100644 --- a/transformer_engine/jax/quantize/tensor.py +++ b/transformer_engine/jax/quantize/tensor.py @@ -209,49 +209,63 @@ class ScaledTensor1x(AbstractBaseTensor1x, ScaledTensor): flatten_axis: int has_rht_applied: bool - def __post_init__(self): - """Validates and adjusts the scale_inv shape after initialization. - - Ensures the scale_inv shape matches the expected shape based on the scaling mode - and quantization direction. Pads the scale_inv if necessary. - """ - assert self.flatten_axis > 0 - assert ( - 0 < self.flatten_axis < len(self.data.shape) - ), f"flatten_axis {self.flatten_axis} is out of bounds for shape {self.data.shape}" - - if self.scaling_mode == ScalingMode.NO_SCALING: - self.scale_inv = jnp.empty((0,), dtype=jnp.float32) - else: - unpadded_scale_shape = self.scaling_mode.get_scale_shape( - self.data.shape, - data_layout=self.data_layout, - is_colwise=self.is_colwise, - is_padded=False, - # expect the flatten_axis wrt the N layout - flatten_axis=( - self.flatten_axis - if self.data_layout == "N" - else self.data.ndim - self.flatten_axis - ), - ) - unpadded_scale_shape_broadcast = self.scaling_mode.get_scale_shape( - self.data.shape, - data_layout=self.data_layout, - is_colwise=self.is_colwise, - is_padded=False, - # expect the flatten_axis wrt the N layout - flatten_axis=( - self.flatten_axis - if self.data_layout == "N" - else self.data.ndim - self.flatten_axis - ), - broadcast_2d_scale_shape_to_1d=True, - ) - assert self.scale_inv.shape in (unpadded_scale_shape, unpadded_scale_shape_broadcast), ( - f"Unpadded inverse scale factor has wrong shape, expected {unpadded_scale_shape} or" - f" {unpadded_scale_shape_broadcast} but got {self.scale_inv.shape}." - ) + # def __post_init__(self): + # """Validates and adjusts the scale_inv shape after initialization. + # + # Ensures the scale_inv shape matches the expected shape based on the scaling mode + # and quantization direction. Pads the scale_inv if necessary. + # """ + # assert self.flatten_axis > 0 + # assert ( + # 0 < self.flatten_axis < len(self.data.shape) + # ), f"flatten_axis {self.flatten_axis} is out of bounds for shape {self.data.shape}" + # + # if self.scaling_mode == ScalingMode.NO_SCALING: + # self.scale_inv = jnp.empty((0,), dtype=jnp.float32) + # else: + # unpadded_scale_shape = self.scaling_mode.get_scale_shape( + # self.data.shape, + # data_layout=self.data_layout, + # is_colwise=self.is_colwise, + # is_padded=False, + # # expect the flatten_axis wrt the N layout + # flatten_axis=( + # self.flatten_axis + # if self.data_layout == "N" + # else self.data.ndim - self.flatten_axis + # ), + # ) + # unpadded_scale_shape_broadcast = self.scaling_mode.get_scale_shape( + # self.data.shape, + # data_layout=self.data_layout, + # is_colwise=self.is_colwise, + # is_padded=False, + # # expect the flatten_axis wrt the N layout + # flatten_axis=( + # self.flatten_axis + # if self.data_layout == "N" + # else self.data.ndim - self.flatten_axis + # ), + # broadcast_2d_scale_shape_to_1d=True, + # ) + # # Check shape, allowing for batch dimensions from vmap + # # If vmapped, shape will be (batch_size, *expected_shape) + # actual_shape = self.scale_inv.shape + # if actual_shape not in (unpadded_scale_shape, unpadded_scale_shape_broadcast): + # # Check if it's a batched version (extra leading dimensions) + # if len(actual_shape) > len(unpadded_scale_shape): + # # Batched: check that trailing dimensions match + # trailing_shape = actual_shape[-(len(unpadded_scale_shape)):] + # if trailing_shape not in (unpadded_scale_shape, unpadded_scale_shape_broadcast): + # raise AssertionError( + # f"Unpadded inverse scale factor has wrong shape, expected {unpadded_scale_shape} or " + # f"{unpadded_scale_shape_broadcast} (possibly with batch dims) but got {self.scale_inv.shape}." + # ) + # else: + # raise AssertionError( + # f"Unpadded inverse scale factor has wrong shape, expected {unpadded_scale_shape} or " + # f"{unpadded_scale_shape_broadcast} but got {self.scale_inv.shape}." + # ) def tree_flatten(self): """Flattens the tensor for JAX tree operations. @@ -431,10 +445,21 @@ def __post_init__(self): flatten_axis=self.flatten_axis, ) - assert self.scale_inv.shape == expected_scale_shape, ( - f"Unexpected scale_inv shape! \nExpect {expected_scale_shape} for padded" - f" scale_inv, got {self.scale_inv.shape}" - ) + # Check shape, allowing for batch dimensions from vmap + actual_shape = self.scale_inv.shape + if actual_shape != expected_scale_shape: + # Check if it's a batched version + if len(actual_shape) > len(expected_scale_shape): + trailing_shape = actual_shape[-(len(expected_scale_shape)) :] + assert trailing_shape == expected_scale_shape, ( + f"Unexpected scale_inv shape! Expected {expected_scale_shape} for padded " + f"scale_inv (possibly with batch dims), got {self.scale_inv.shape}" + ) + else: + raise AssertionError( + f"Unexpected scale_inv shape! Expected {expected_scale_shape} for padded " + f"scale_inv, got {self.scale_inv.shape}" + ) def tree_flatten(self): """Flattens the tensor for JAX tree operations. From 1f02cf41c7b521b82d99058e8f0fb6f2bd5b048e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 21:08:42 +0000 Subject: [PATCH 02/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/jax/test_einsum.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/jax/test_einsum.py b/tests/jax/test_einsum.py index 39dffa6787..7580a14638 100644 --- a/tests/jax/test_einsum.py +++ b/tests/jax/test_einsum.py @@ -145,7 +145,9 @@ def test_full_moe_grad(self, B, S, M, E, C, H, recipe): mlp_up_quantizer_sets = self._get_quantizer_sets(recipe, E) mlp_down_quantizer_sets = self._get_quantizer_sets(recipe, E) - tokens = jax.random.normal(jax.random.PRNGKey(0), (B, S, M), dtype=jnp.bfloat16) / jnp.sqrt(M) + tokens = jax.random.normal(jax.random.PRNGKey(0), (B, S, M), dtype=jnp.bfloat16) / jnp.sqrt( + M + ) routing = jax.random.normal(jax.random.PRNGKey(1), (B, S, E, C), dtype=jnp.bfloat16) routing = jax.nn.softmax(routing, axis=-1) # Normalize routing weights up_weights = jax.random.normal( From bf3ebc2ccf98a016ff61f859df7fa2686f36114d Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Wed, 10 Dec 2025 15:29:37 +0100 Subject: [PATCH 03/98] code drop Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/CMakeLists.txt | 1 + tests/cpp/operator/test_grouped_gemm.cu | 511 ++++++++++++++++++ .../common/gemm/cublaslt_gemm.cu | 484 +++++++++++++++++ .../common/include/transformer_engine/gemm.h | 36 ++ 4 files changed, 1032 insertions(+) create mode 100644 tests/cpp/operator/test_grouped_gemm.cu diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt index b2f14b1892..1392ffdadc 100644 --- a/tests/cpp/operator/CMakeLists.txt +++ b/tests/cpp/operator/CMakeLists.txt @@ -30,6 +30,7 @@ add_executable(test_operator test_causal_softmax.cu test_swizzle.cu test_swap_first_dims.cu + test_grouped_gemm.cu ../test_common.cu) # Find required packages diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu new file mode 100644 index 0000000000..0e9c6c6a4d --- /dev/null +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -0,0 +1,511 @@ +/*********************************************************************** + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. + * + * See LICENSE for license information. + **********************************************************************/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../test_common.h" + +using namespace transformer_engine; +using namespace test; + +namespace { + +enum class InputCase { + kFP8Delayed, + kFP8Current, + kBF16, +}; + +enum class ShapeCase { + kAllSame, + kSameFirst, + kSameLast, + kAllDifferent, +}; + +// Helper owning GPU buffers that back NVTEGroupedTensor. +// NVTEGroupedTensor does not own memory; data/offsets/scales +// must be allocated and freed by the test. +struct GroupedBuffers { + NVTEGroupedTensor handle{nullptr}; + void* data{nullptr}; + void* scale_inv{nullptr}; + int64_t* first_dims_dev{nullptr}; + int64_t* last_dims_dev{nullptr}; + int64_t* offsets_dev{nullptr}; + void* columnwise_data{nullptr}; + NVTEShape logical_shape{}; + std::vector offsets_host; + std::vector tensor_bytes; + size_t num_tensors{0}; + size_t elem_size{0}; + DType dtype{DType::kFloat32}; + NVTEScalingMode scaling_mode{NVTE_DELAYED_TENSOR_SCALING}; + + GroupedBuffers() = default; + GroupedBuffers(const GroupedBuffers&) = delete; + GroupedBuffers& operator=(const GroupedBuffers&) = delete; + GroupedBuffers(GroupedBuffers&& other) noexcept { + *this = std::move(other); + } + GroupedBuffers& operator=(GroupedBuffers&& other) noexcept { + if (this == &other) return *this; + handle = other.handle; + data = other.data; + scale_inv = other.scale_inv; + first_dims_dev = other.first_dims_dev; + last_dims_dev = other.last_dims_dev; + offsets_dev = other.offsets_dev; + logical_shape = other.logical_shape; + offsets_host = std::move(other.offsets_host); + tensor_bytes = std::move(other.tensor_bytes); + num_tensors = other.num_tensors; + elem_size = other.elem_size; + dtype = other.dtype; + scaling_mode = other.scaling_mode; + + other.handle = nullptr; + other.data = nullptr; + other.scale_inv = nullptr; + other.first_dims_dev = nullptr; + other.last_dims_dev = nullptr; + other.offsets_dev = nullptr; + other.num_tensors = 0; + return *this; + } + + ~GroupedBuffers() { + if (data) { + cudaFree(data); + data = nullptr; + } + if (scale_inv) { + cudaFree(scale_inv); + scale_inv = nullptr; + } + if (columnwise_data) { + cudaFree(columnwise_data); + columnwise_data = nullptr; + } + if (first_dims_dev) { + cudaFree(first_dims_dev); + first_dims_dev = nullptr; + } + if (last_dims_dev) { + cudaFree(last_dims_dev); + last_dims_dev = nullptr; + } + if (offsets_dev) { + cudaFree(offsets_dev); + offsets_dev = nullptr; + } + if (handle) { + nvte_destroy_grouped_tensor(handle); + handle = nullptr; + } + } +}; + +size_t grouped_setup_workspace_size(const size_t num_tensors) { + const size_t ptr_bytes = num_tensors * sizeof(void*); + const size_t int_bytes = num_tensors * sizeof(int); + size_t size = 4 * ptr_bytes + 3 * int_bytes + 2 * ptr_bytes; + const size_t alignment = 256; + size = ((size + alignment - 1) / alignment) * alignment; + return size; +} + +GroupedBuffers build_grouped_tensor(const std::vector& tensors, + const NVTEScalingMode scaling_mode) { + NVTE_CHECK(!tensors.empty(), "No tensors provided for grouped tensor build."); + const NVTEShape shape = tensors[0]->rowwise_shape(); + const DType dtype = tensors[0]->dtype(); + const size_t num_tensors = tensors.size(); + const size_t elem_size = typeToSize(dtype); + GroupedBuffers grouped; + grouped.elem_size = elem_size; + grouped.num_tensors = num_tensors; + grouped.dtype = dtype; + grouped.scaling_mode = scaling_mode; + grouped.tensor_bytes.resize(num_tensors); + grouped.offsets_host.resize(num_tensors, 0); + + std::vector first_dims(num_tensors); + std::vector last_dims(num_tensors); + for (size_t i = 0; i < num_tensors; ++i) { + const auto s = tensors[i]->rowwise_shape(); + NVTE_CHECK(s.ndim == 2, "Grouped GEMM test expects 2D tensors."); + first_dims[i] = static_cast(s.data[0]); + last_dims[i] = static_cast(s.data[1]); + grouped.tensor_bytes[i] = bytes(s, dtype); + } + + const bool same_first = std::all_of(first_dims.begin(), first_dims.end(), + [&](int64_t v) { return v == first_dims[0]; }); + const bool same_last = std::all_of(last_dims.begin(), last_dims.end(), + [&](int64_t v) { return v == last_dims[0]; }); + + std::vector offsets(num_tensors, 0); + auto random_padding = [&]() -> int64_t { + static std::mt19937 gen(12345); + std::uniform_int_distribution dist(0, 3); + return dist(gen); + }; + + auto numel = [&](size_t idx) -> int64_t { + return first_dims[idx] * last_dims[idx]; + }; + + const bool need_offsets = !same_first || !same_last; + if (need_offsets) { + offsets[0] = 0; + for (size_t i = 1; i < num_tensors; ++i) { + offsets[i] = offsets[i - 1] + numel(i - 1) + random_padding(); + } + } else { + for (size_t i = 0; i < num_tensors; ++i) { + offsets[i] = static_cast(i) * numel(0); + } + } + grouped.offsets_host = offsets; + + int64_t logical_first = 0; + int64_t logical_last = 0; + if (same_first && same_last) { + logical_first = first_dims[0] * static_cast(num_tensors); + logical_last = last_dims[0]; + } else if (same_first && !same_last) { + logical_first = first_dims[0]; + logical_last = std::accumulate(last_dims.begin(), last_dims.end(), int64_t{0}); + } else if (!same_first && same_last) { + logical_first = std::accumulate(first_dims.begin(), first_dims.end(), int64_t{0}); + logical_last = last_dims[0]; + } else { + logical_first = 1; + logical_last = 0; + for (size_t i = 0; i < num_tensors; ++i) { + logical_last += first_dims[i] * last_dims[i]; + } + } + size_t logical_data[2] = {static_cast(logical_first), + static_cast(logical_last)}; + grouped.logical_shape = nvte_make_shape(logical_data, 2); + grouped.handle = nvte_create_grouped_tensor(scaling_mode, num_tensors, grouped.logical_shape); + + const int64_t last_idx = static_cast(num_tensors - 1); + const int64_t total_elems = need_offsets + ? (offsets[last_idx] + numel(last_idx)) + : (logical_first * logical_last); + const size_t total_bytes = static_cast(total_elems) * elem_size; + + NVTE_CHECK_CUDA(cudaMalloc(&grouped.data, total_bytes)); + for (size_t i = 0; i < num_tensors; ++i) { + const size_t offset_bytes = static_cast(offsets[i]) * elem_size; + NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.data) + offset_bytes, + tensors[i]->rowwise_dptr(), + grouped.tensor_bytes[i], + cudaMemcpyDeviceToDevice)); + } + + NVTEBasicTensor data_tensor{grouped.data, static_cast(dtype), grouped.logical_shape}; + nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedRowwiseData, &data_tensor); + + const bool include_columnwise = isFp8Type(dtype) || isFp4Type(dtype); + if (include_columnwise) { + NVTE_CHECK_CUDA(cudaMalloc(&grouped.columnwise_data, total_bytes)); + for (size_t i = 0; i < num_tensors; ++i) { + const size_t offset_bytes = static_cast(offsets[i]) * elem_size; + NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.columnwise_data) + offset_bytes, + tensors[i]->columnwise_dptr(), + grouped.tensor_bytes[i], + cudaMemcpyDeviceToDevice)); + } + NVTEBasicTensor col_tensor{grouped.columnwise_data, + static_cast(dtype), + grouped.logical_shape}; + nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedColumnwiseData, &col_tensor); + } + + if (!same_first) { + NVTE_CHECK_CUDA(cudaMalloc(&grouped.first_dims_dev, num_tensors * sizeof(int64_t))); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.first_dims_dev, first_dims.data(), + num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); + NVTEShape fd_shape = nvte_make_shape(&num_tensors, 1); + NVTEBasicTensor fd_tensor{grouped.first_dims_dev, kNVTEInt64, fd_shape}; + nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedFirstDims, &fd_tensor); + } + + if (!same_last) { + NVTE_CHECK_CUDA(cudaMalloc(&grouped.last_dims_dev, num_tensors * sizeof(int64_t))); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.last_dims_dev, last_dims.data(), + num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); + NVTEShape ld_shape = nvte_make_shape(&num_tensors, 1); + NVTEBasicTensor ld_tensor{grouped.last_dims_dev, kNVTEInt64, ld_shape}; + nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedLastDims, &ld_tensor); + } + + if (!same_first || !same_last) { + NVTE_CHECK_CUDA(cudaMalloc(&grouped.offsets_dev, num_tensors * sizeof(int64_t))); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.offsets_dev, offsets.data(), + num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); + NVTEShape off_shape = nvte_make_shape(&num_tensors, 1); + NVTEBasicTensor off_tensor{grouped.offsets_dev, kNVTEInt64, off_shape}; + nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedTensorOffsets, &off_tensor); + } + + if (isFp8Type(dtype)) { + std::vector scale_inv_cpu(num_tensors, 1.f); + for (size_t i = 0; i < num_tensors; ++i) { + tensors[i]->to_cpu(); + scale_inv_cpu[i] = tensors[i]->rowwise_cpu_scale_inv_ptr()[0]; + } + NVTE_CHECK_CUDA(cudaMalloc(&grouped.scale_inv, sizeof(float) * num_tensors)); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.scale_inv, scale_inv_cpu.data(), + sizeof(float) * num_tensors, cudaMemcpyHostToDevice)); + NVTEShape scale_shape = nvte_make_shape(&num_tensors, 1); + NVTEBasicTensor scale_tensor{grouped.scale_inv, kNVTEFloat32, scale_shape}; + nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedRowwiseScaleInv, &scale_tensor); + nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedColumnwiseScaleInv, &scale_tensor); + } + + return grouped; +} + +Tensor make_fp8_operand(const std::string& name, const std::vector& shape) { + Tensor input_fp32(name + "_fp32", shape, DType::kFloat32); + fillUniform(&input_fp32); + + Tensor fp8(name, shape, TypeInfo::dtype, true, true, NVTE_DELAYED_TENSOR_SCALING); + + nvte_compute_amax(input_fp32.data(), fp8.data(), 0); + QuantizationConfigWrapper config; + nvte_compute_scale_from_amax(fp8.data(), config, 0); + nvte_quantize(input_fp32.data(), fp8.data(), 0); + return fp8; +} + +Tensor make_bf16_operand(const std::string& name, const std::vector& shape) { + Tensor t(name, shape, DType::kBFloat16); + fillUniform(&t); + return t; +} + +struct TestParams { + InputCase input_case; + bool transa; + bool transb; + ShapeCase shape_case; +}; + +std::vector> make_shapes(ShapeCase scase) { + switch (scase) { + case ShapeCase::kAllSame: + return {{64, 64, 32}, {64, 64, 32}, {64, 64, 32}}; + case ShapeCase::kSameFirst: // M wspólne, N/K zróżnicowane + return {{64, 64, 32}, {64, 96, 32}, {64, 80, 48}}; + case ShapeCase::kSameLast: // N wspólne, M/K zróżnicowane + return {{48, 80, 32}, {96, 80, 48}, {72, 80, 40}}; + case ShapeCase::kAllDifferent: + default: + return {{48, 80, 32}, {96, 64, 48}, {40, 72, 24}}; + } +} + +void run_grouped_gemm_case(const TestParams& params) { + if (params.input_case != InputCase::kBF16 && + getDeviceComputeCapability() < hopperComputeCapability) { + GTEST_SKIP() << "FP8 grouped GEMM requires Hopper or newer."; + } + + const std::vector> shapes = make_shapes(params.shape_case); + + const size_t num_gemms = shapes.size(); + std::vector A_tensors; + std::vector B_tensors; + std::vector D_multi; + + A_tensors.reserve(num_gemms); + B_tensors.reserve(num_gemms); + D_multi.reserve(num_gemms); + + for (size_t i = 0; i < num_gemms; ++i) { + const auto [M, N, K] = shapes[i]; + const std::vector a_shape = params.transa ? std::vector{K, M} + : std::vector{M, K}; + const std::vector b_shape = params.transb ? std::vector{N, K} + : std::vector{K, N}; + switch (params.input_case) { + case InputCase::kFP8Current: { + A_tensors.emplace_back(make_fp8_operand("A" + std::to_string(i), a_shape)); + B_tensors.emplace_back(make_fp8_operand("B" + std::to_string(i), b_shape)); + break; + } + case InputCase::kBF16: { + A_tensors.emplace_back(make_bf16_operand("A" + std::to_string(i), a_shape)); + B_tensors.emplace_back(make_bf16_operand("B" + std::to_string(i), b_shape)); + break; + } + } + D_multi.emplace_back(Tensor("D_multi" + std::to_string(i), + std::vector{M, N}, + DType::kBFloat16)); + } + + std::vector A_ptrs(num_gemms); + std::vector B_ptrs(num_gemms); + std::vector D_ptrs(num_gemms); + std::vector bias_ptrs(num_gemms, nullptr); + std::vector gelu_ptrs(num_gemms, nullptr); + std::vector workspaces(num_gemms); + std::vector workspace_ptrs(num_gemms, nullptr); + + const size_t cublas_ws_bytes = 32ull * 1024 * 1024; + + for (size_t i = 0; i < num_gemms; ++i) { + A_ptrs[i] = A_tensors[i].data(); + B_ptrs[i] = B_tensors[i].data(); + D_ptrs[i] = D_multi[i].data(); + workspaces[i] = Tensor("workspace" + std::to_string(i), std::vector{cublas_ws_bytes}, DType::kByte); + workspace_ptrs[i] = workspaces[i].data(); + } + + nvte_multi_tensor_gemm(A_ptrs.data(), + B_ptrs.data(), + D_ptrs.data(), + bias_ptrs.data(), + gelu_ptrs.data(), + static_cast(num_gemms), + params.transa, + params.transb, + false, + workspace_ptrs.data(), + false, + false, + 0, + 0); + + GroupedBuffers grouped_A = build_grouped_tensor(A_tensors, A_tensors[0].scaling_mode()); + GroupedBuffers grouped_B = build_grouped_tensor(B_tensors, B_tensors[0].scaling_mode()); + + std::vector C_tensors; + std::vector D_group_tensors; + C_tensors.reserve(num_gemms); + D_group_tensors.reserve(num_gemms); + for (size_t i = 0; i < num_gemms; ++i) { + const auto [M, N, K] = shapes[i]; + (void)K; + C_tensors.emplace_back(Tensor("C" + std::to_string(i), + std::vector{static_cast(M), static_cast(N)}, + DType::kBFloat16)); + D_group_tensors.emplace_back(Tensor("D_group" + std::to_string(i), + std::vector{static_cast(M), static_cast(N)}, + DType::kBFloat16)); + NVTE_CHECK_CUDA(cudaMemset(D_group_tensors.back().rowwise_dptr(), 0, bytes(D_group_tensors.back().rowwise_shape(), D_group_tensors.back().dtype()))); + } + + std::vector C_views, D_views; + for (size_t i = 0; i < num_gemms; ++i) { + C_views.push_back(&C_tensors[i]); + D_views.push_back(&D_group_tensors[i]); + } + + GroupedBuffers grouped_C = build_grouped_tensor(C_views, NVTE_DELAYED_TENSOR_SCALING); + GroupedBuffers grouped_D = build_grouped_tensor(D_views, NVTE_DELAYED_TENSOR_SCALING); + + Tensor alpha_tensor("alpha", std::vector{1}, DType::kFloat32); + Tensor beta_tensor("beta", std::vector{1}, DType::kFloat32); + const float alpha_val = 1.f; + const float beta_val = 0.f; + NVTE_CHECK_CUDA(cudaMemcpy(alpha_tensor.rowwise_dptr(), &alpha_val, sizeof(float), cudaMemcpyHostToDevice)); + NVTE_CHECK_CUDA(cudaMemcpy(beta_tensor.rowwise_dptr(), &beta_val, sizeof(float), cudaMemcpyHostToDevice)); + + const size_t setup_ws_bytes = grouped_setup_workspace_size(num_gemms); + Tensor setup_ws("setup_ws", std::vector{setup_ws_bytes}, DType::kByte); + Tensor cublas_ws("cublas_ws", std::vector{cublas_ws_bytes}, DType::kByte); + + nvte_grouped_gemm(params.transa, + params.transb, + alpha_tensor.data(), + grouped_A.handle, + grouped_B.handle, + beta_tensor.data(), + grouped_C.handle, + grouped_D.handle, + setup_ws.data(), + cublas_ws.data(), + nullptr, + 0, + nullptr, + nullptr, + nullptr); + + for (size_t i = 0; i < num_gemms; ++i) { + Tensor grouped_split("grouped_D" + std::to_string(i), + std::vector{static_cast(std::get<0>(shapes[i])), + static_cast(std::get<1>(shapes[i]))}, + D_multi[i].dtype()); + const size_t offset_bytes = static_cast(grouped_D.offsets_host[i]) * grouped_D.elem_size; + NVTE_CHECK_CUDA(cudaMemcpy(grouped_split.rowwise_dptr(), + static_cast(grouped_D.data) + offset_bytes, + grouped_D.tensor_bytes[i], + cudaMemcpyDeviceToDevice)); + grouped_split.to_cpu(); + D_multi[i].to_cpu(); + auto [atol, rtol] = getTolerances(D_multi[i].dtype()); + compareResults("grouped_vs_multi", + grouped_split, + D_multi[i].rowwise_cpu_dptr(), + true, + atol, + rtol); + } +} + +class GroupedGemmTest : public ::testing::TestWithParam {}; + +TEST_P(GroupedGemmTest, CompareWithMultiTensorGemm) { + run_grouped_gemm_case(GetParam()); +} + +std::string MakeGroupedGemmTestName(const testing::TestParamInfo& info) { + constexpr const char* kInputNames[] = {"FP8Delayed", "FP8Current", "BF16"}; + constexpr const char* kShapeNames[] = {"AllSame", "SameM", "SameN", "AllDiff"}; + const std::string layout = std::string("ta") + (info.param.transa ? "T" : "N") + + "tb" + (info.param.transb ? "T" : "N"); + return std::string(kInputNames[static_cast(info.param.input_case)]) + "_" + + kShapeNames[static_cast(info.param.shape_case)] + "_" + layout; +} + +const std::vector kTestParams = { + {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent}, + {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent}, + {InputCase::kFP8Current, false, false, ShapeCase::kAllSame}, + {InputCase::kBF16, true, false, ShapeCase::kSameFirst}, + {InputCase::kBF16, false, true, ShapeCase::kSameLast}, + {InputCase::kBF16, false, false, ShapeCase::kAllSame}, + {InputCase::kBF16, true, true, ShapeCase::kAllDifferent}, +}; + +INSTANTIATE_TEST_SUITE_P(OperatorTest, + GroupedGemmTest, + ::testing::ValuesIn(kTestParams), + MakeGroupedGemmTestName); + +} // namespace + + diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 97e8ec9a3e..53be59cc00 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1104,3 +1104,487 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor cublas_path(); } } + + +// Helper struct to pass per-tensor shape/offset info (pointer or uniform value) +struct TensorShapeInfo { + const int64_t *first_dims; // nullptr if uniform + const int64_t *last_dims; // nullptr if uniform + const int64_t *offsets; // nullptr if need to compute + int64_t uniform_first; // used if first_dims == nullptr + int64_t uniform_last; // used if last_dims == nullptr + + // Create from GroupedTensor + static TensorShapeInfo from_tensor(const transformer_engine::GroupedTensor *t) { + return { + t->first_dims.has_data() ? static_cast(t->first_dims.dptr) : nullptr, + t->last_dims.has_data() ? static_cast(t->last_dims.dptr) : nullptr, + t->tensor_offsets.has_data() ? static_cast(t->tensor_offsets.dptr) : nullptr, + t->get_common_first_dim(), + t->get_common_last_dim()}; + } + + // Create for C tensor (uses D's dimensions, only has offsets) + static TensorShapeInfo for_C(const transformer_engine::GroupedTensor *C, + const transformer_engine::GroupedTensor *D) { + return { + nullptr, + nullptr, + C->tensor_offsets.has_data() ? static_cast(C->tensor_offsets.dptr) : nullptr, + D->get_common_first_dim(), + D->get_common_last_dim()}; + } +}; + +// Helper functions to compute average dimensions from logical_shape for heuristics +// These are hints for cuBLASLt algorithm selection, don't need to be exact +inline int64_t compute_avg_first_dim(const transformer_engine::GroupedTensor* t) { + // logical_shape[0] is either num_tensors*M (uniform) or sum_of_M (varying first) + // In both cases, dividing by num_tensors gives the average + return static_cast(t->logical_shape.data[0]) / static_cast(t->num_tensors); +} + +inline int64_t compute_avg_last_dim(const transformer_engine::GroupedTensor* t) { + if (t->all_same_last_dim()) { + // logical_shape[1] is the common N + return static_cast(t->logical_shape.data[1]); + } else { + // logical_shape[1] is sum_of_N, divide by num_tensors + return static_cast(t->logical_shape.data[1]) / static_cast(t->num_tensors); + } +} + +// Workspace layout for grouped GEMM +struct GroupedGemmSetupWorkspace { + void **A_ptrs; + void **B_ptrs; + void **C_ptrs; + void **D_ptrs; + int *M; + int *N; + int *K; + float **alpha_ptrs; + float **beta_ptrs; + + // Initialize from workspace buffer + static GroupedGemmSetupWorkspace from_buffers(char *setup_ws_ptr, size_t num_tensors, size_t alignment) { + GroupedGemmSetupWorkspace ws; + size_t offset = 0; + const size_t ptr_size = num_tensors * sizeof(void *); + const size_t int_size = num_tensors * sizeof(int); + + ws.A_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; + ws.B_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; + ws.C_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; + ws.D_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; + ws.M = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; + ws.N = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; + ws.K = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; + ws.alpha_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; + ws.beta_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; + + offset = ((offset + alignment - 1) / alignment) * alignment; + + return ws; + } + + // Calculate required size for setup workspace (pointer arrays + M/N/K + alpha/beta ptrs) + static size_t required_setup_size(size_t num_tensors, size_t alignment) { + const size_t ptr_size = num_tensors * sizeof(void *); + const size_t int_size = num_tensors * sizeof(int); + size_t size = 4 * ptr_size + 3 * int_size + 2 * ptr_size; // M, N, K only (no LDA/LDB/LDC/LDD) + size = ((size + alignment - 1) / alignment) * alignment; + return size; + } +}; + +// ----------------------------------------------------------------------------- +// Helper routines to keep nvte_grouped_gemm readable +// ----------------------------------------------------------------------------- +inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor* inputA, + const transformer_engine::GroupedTensor* inputB, + const transformer_engine::GroupedTensor* inputC, + const transformer_engine::GroupedTensor* outputD) { + const size_t num_tensors = inputA->num_tensors; + NVTE_CHECK(num_tensors >= 1, "Grouped GEMM: num_tensors must be at least 1"); + NVTE_CHECK(inputB->num_tensors == num_tensors, + "Grouped GEMM: A and B must have the same num_tensors"); + NVTE_CHECK(inputC->num_tensors == num_tensors, + "Grouped GEMM: A and C must have the same num_tensors"); + NVTE_CHECK(outputD->num_tensors == num_tensors, + "Grouped GEMM: A and D must have the same num_tensors"); + + auto is_fp8_or_16bit = [](DType dtype) { + return dtype == DType::kFloat8E4M3 || dtype == DType::kFloat8E5M2 || + dtype == DType::kBFloat16 || dtype == DType::kFloat16; + }; + auto is_output_dtype = [](DType dtype) { + return dtype == DType::kBFloat16 || dtype == DType::kFloat16 || dtype == DType::kFloat32; + }; + NVTE_CHECK(is_fp8_or_16bit(inputA->dtype()) && is_fp8_or_16bit(inputB->dtype()), + "Grouped GEMM inputs must be FP8, BF16, or FP16."); + NVTE_CHECK(is_output_dtype(inputC->dtype()) && is_output_dtype(outputD->dtype()), + "Grouped GEMM outputs must be BF16, FP16, or FP32."); + NVTE_CHECK(inputA->has_data() || inputA->has_columnwise_data(), + "Grouped GEMM: A tensor is missing both row-wise and column-wise data"); + NVTE_CHECK(inputB->has_data() || inputB->has_columnwise_data(), + "Grouped GEMM: B tensor is missing both row-wise and column-wise data"); +} + +// Select row-wise vs column-wise storage and adjust transpose flag for grouped GEMM. +// Mirrors the non-grouped GEMM logic for FP8 layout handling (TN-only on Hopper) and +// fallback to column-wise data when row-wise is absent. +struct GroupedOperandSelection { + const char* base = nullptr; + transformer_engine::DType dtype = transformer_engine::DType::kNumTypes; + bool trans = false; + bool use_columnwise = false; +}; + +inline GroupedOperandSelection select_grouped_operand(const transformer_engine::GroupedTensor* t, + bool trans, bool is_A) { + using namespace transformer_engine; + const bool has_row = t->has_data(); + const bool has_col = t->has_columnwise_data(); + NVTE_CHECK(has_row || has_col, "Grouped GEMM operand is missing both row-wise and column-wise data"); + + // Not yet supported in grouped GEMM: block scaling, MXFP8, NVFP4 specialized layouts. + const auto sm = t->scaling_mode; + NVTE_CHECK(sm != NVTE_BLOCK_SCALING_1D && sm != NVTE_BLOCK_SCALING_2D && + !is_mxfp_scaling(sm) && !is_nvfp_scaling(sm), + "Grouped GEMM does not yet support NVFP4/MXFP8/block scaling operand selection"); + + const DType row_dtype = t->data.dtype; + const DType col_dtype = t->columnwise_data.dtype; + GroupedOperandSelection sel; + sel.trans = trans; + + const DType rep_dtype = has_row ? row_dtype : col_dtype; + const bool is_fp8 = is_fp8_dtype(rep_dtype); + const bool non_tn_fp8_ok = nvte_is_non_tn_fp8_gemm_supported(); + + // Hopper-style TN-only FP8: force TN by switching layout and flipping transpose when needed. + if (is_fp8 && !non_tn_fp8_ok) { + if (is_A) { + if (!sel.trans) { + NVTE_CHECK(has_col, "Grouped GEMM: A is missing column-wise data needed for FP8 TN layout"); + sel.base = static_cast(t->columnwise_data.dptr); + sel.dtype = col_dtype; + sel.trans = true; // using pre-transposed storage + sel.use_columnwise = true; + return sel; + } + } else { // B + if (sel.trans) { + NVTE_CHECK(has_col, "Grouped GEMM: B is missing column-wise data needed for FP8 TN layout"); + sel.base = static_cast(t->columnwise_data.dptr); + sel.dtype = col_dtype; + sel.trans = false; // using pre-transposed storage + sel.use_columnwise = true; + return sel; + } + } + } + + // If only column-wise data is available, mirror the transpose flag (pre-transposed storage). + if (!has_row && has_col) { + sel.base = static_cast(t->columnwise_data.dptr); + sel.dtype = col_dtype; + sel.trans = !sel.trans; + sel.use_columnwise = true; + return sel; + } + + // Default: use row-wise data (or column-wise if row-wise absent, covered above). + sel.base = static_cast(has_row ? t->data.dptr : t->columnwise_data.dptr); + sel.dtype = has_row ? row_dtype : col_dtype; + sel.use_columnwise = !has_row && has_col; + return sel; +} + +inline void* validate_and_get_workspace_ptr(transformer_engine::Tensor* ws, size_t required_size, + const char* workspace_name) { + NVTE_CHECK(ws != nullptr, workspace_name, " tensor is null."); + const size_t provided_size = get_buffer_size_bytes(ws->data.numel(), ws->data.dtype); + NVTE_CHECK(provided_size >= required_size, + "Grouped GEMM: Insufficient ", workspace_name, ". Required: ", required_size, + " bytes, Available: ", provided_size, " bytes."); + return ws->data.dptr; +} + +inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t& descA, + cublasLtMatrixLayoutOpaque_t& descB, + cublasLtMatrixLayoutOpaque_t& descC, + cublasLtMatrixLayoutOpaque_t& descD, + const GroupedGemmWorkspace& ws, bool transa, bool transb, + bool a_columnwise, bool b_columnwise, + size_t num_tensors, cudaDataType_t A_type, cudaDataType_t B_type, + cudaDataType_t D_type) { + // For column-major layout: leading dimension is the number of rows in storage. + // If columnwise data was chosen, storage is already transposed. + const int* rowa = a_columnwise ? ws.M : (transa ? ws.K : ws.M); + const int* cola = a_columnwise ? ws.K : (transa ? ws.M : ws.K); + const int* lda = rowa; + const int* rowb = b_columnwise ? ws.N : (transb ? ws.N : ws.K); + const int* colb = b_columnwise ? ws.K : (transb ? ws.K : ws.N); + const int* ldb = rowb; + + NVTE_CHECK_CUBLAS( + cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, (void*)rowa, (void*)cola, (void*)lda)); + NVTE_CHECK_CUBLAS( + cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, (void*)rowb, (void*)colb, (void*)ldb)); + NVTE_CHECK_CUBLAS( + cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, (void*)ws.M, (void*)ws.N, (void*)ws.M)); + NVTE_CHECK_CUBLAS( + cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, (void*)ws.M, (void*)ws.N, (void*)ws.M)); +} + +inline void init_matmul_desc(cublasLtMatmulDescOpaque_t& matmulDesc, cublasOperation_t op_A, + cublasOperation_t op_B) { + NVTE_CHECK_CUBLAS(cublasLtMatmulDescInit(&matmulDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); + + NVTE_CHECK_CUBLAS( + cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(op_A))); + NVTE_CHECK_CUBLAS( + cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(op_B))); + + cublasLtPointerMode_t pointer_mode = CUBLASLT_POINTER_MODE_DEVICE; + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, + &pointer_mode, sizeof(pointer_mode))); + + int64_t alphabeta_batch_stride = 1; + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_ALPHA_BATCH_STRIDE, + &alphabeta_batch_stride, sizeof(int64_t))); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_BETA_BATCH_STRIDE, + &alphabeta_batch_stride, sizeof(int64_t))); +} + +inline cublasLtMatmulAlgo_t select_grouped_gemm_algo(cublasLtHandle_t handle, + cublasLtMatmulDescOpaque_t& matmulDesc, + cublasLtMatrixLayoutOpaque_t& descA, + cublasLtMatrixLayoutOpaque_t& descB, + cublasLtMatrixLayoutOpaque_t& descC, + cublasLtMatrixLayoutOpaque_t& descD, int64_t avg_m, + int64_t avg_n, int64_t avg_k) { + cublasLtMatmulPreferenceOpaque_t preference; + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceInit(&preference)); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &kGroupedGemmCublasWorkspaceSize, + sizeof(size_t))); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_GROUPED_DESC_D_AVERAGE_ROWS, &avg_m, sizeof(int64_t))); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_GROUPED_DESC_D_AVERAGE_COLS, &avg_n, sizeof(int64_t))); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_GROUPED_AVERAGE_REDUCTION_DIM, &avg_k, sizeof(int64_t))); + + cublasLtMatmulHeuristicResult_t heuristicResult; + int returnedResults = 0; + auto status = cublasLtMatmulAlgoGetHeuristic(handle, &matmulDesc, &descA, &descB, &descC, &descD, + &preference, 1, &heuristicResult, &returnedResults); + NVTE_CHECK(status != CUBLAS_STATUS_NOT_SUPPORTED, "Unable to find suitable cuBLAS grouped GEMM algorithm"); + NVTE_CHECK_CUBLAS(status); + NVTE_CHECK(returnedResults > 0, "No suitable algorithm found for grouped GEMM"); + return heuristicResult.algo; +} + +// Single kernel that sets up all GEMM parameters. +// Rationale: cuBLASLt grouped matmul API needs flat arrays of pointers and per-matrix M/N/K, +// but NVTEGroupedTensor stores a single contiguous buffer + optional per-tensor offsets/shapes. +// We bridge the mismatch on GPU by computing per-group pointers and dims in one kernel. +__global__ void setup_grouped_gemm_kernel( + // Output arrays + void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, + int *M, int *N, int *K, + float **alpha_ptrs, float **beta_ptrs, + // Base pointers + const char *a_base, const char *b_base, const char *c_base, char *d_base, + // Dimension info (per tensor) + TensorShapeInfo A_meta, TensorShapeInfo B_meta, + TensorShapeInfo C_meta, TensorShapeInfo D_meta, + // Element sizes + size_t a_elem_size, size_t b_elem_size, size_t c_elem_size, size_t d_elem_size, + // Alpha/beta pointers (same for all groups) + float *alpha_ptr, float *beta_ptr, + // Transpose flags + bool transa, bool transb, + // Number of tensors + size_t num_tensors) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_tensors) return; + + // Get dimensions for this tensor (from array or uniform value) + int64_t a_first = A_meta.first_dims ? A_meta.first_dims[idx] : A_meta.uniform_first; + int64_t a_last = A_meta.last_dims ? A_meta.last_dims[idx] : A_meta.uniform_last; + int64_t b_first = B_meta.first_dims ? B_meta.first_dims[idx] : B_meta.uniform_first; + int64_t b_last = B_meta.last_dims ? B_meta.last_dims[idx] : B_meta.uniform_last; + + // Compute offsets (from array or compute from uniform dims) + int64_t a_offset = A_meta.offsets ? A_meta.offsets[idx] : (idx * A_meta.uniform_first * A_meta.uniform_last); + int64_t b_offset = B_meta.offsets ? B_meta.offsets[idx] : (idx * B_meta.uniform_first * B_meta.uniform_last); + int64_t c_offset = C_meta.offsets ? C_meta.offsets[idx] : (idx * C_meta.uniform_first * C_meta.uniform_last); + int64_t d_offset = D_meta.offsets ? D_meta.offsets[idx] : (idx * D_meta.uniform_first * D_meta.uniform_last); + + // Compute data pointers + A_ptrs[idx] = const_cast(a_base) + a_offset * a_elem_size; + B_ptrs[idx] = const_cast(b_base) + b_offset * b_elem_size; + C_ptrs[idx] = const_cast(c_base) + c_offset * c_elem_size; + D_ptrs[idx] = d_base + d_offset * d_elem_size; + + // Compute M, N, K dimensions + M[idx] = static_cast(transa ? a_last : a_first); + K[idx] = static_cast(transa ? a_first : a_last); + N[idx] = static_cast(transb ? b_first : b_last); + + // Fill alpha/beta pointers (same for all groups) + alpha_ptrs[idx] = alpha_ptr; + beta_ptrs[idx] = beta_ptr; +} + +// Launch the setup kernel to populate workspace arrays +inline void launch_grouped_gemm_setup( + const GroupedGemmWorkspace &ws, + const transformer_engine::GroupedTensor *A, + const transformer_engine::GroupedTensor *B, + const transformer_engine::GroupedTensor *C, + const transformer_engine::GroupedTensor *D, + const transformer_engine::Tensor *alpha_tensor, + const transformer_engine::Tensor *beta_tensor, + const char *a_base, const char *b_base, + size_t a_elem_size, size_t b_elem_size, + bool transa, bool transb, + size_t num_tensors, cudaStream_t stream) { + + TensorShapeInfo A_meta = TensorShapeInfo::from_tensor(A); + TensorShapeInfo B_meta = TensorShapeInfo::from_tensor(B); + TensorShapeInfo C_meta = TensorShapeInfo::for_C(C, D); + TensorShapeInfo D_meta = TensorShapeInfo::from_tensor(D); + + const char *c_base = static_cast(C->data.dptr); + char *d_base = static_cast(D->data.dptr); + + const size_t c_elem_size = transformer_engine::typeToSize(C->dtype()); + const size_t d_elem_size = transformer_engine::typeToSize(D->dtype()); + + const int threads_per_block = 256; + const int num_blocks = (num_tensors + threads_per_block - 1) / threads_per_block; + + setup_grouped_gemm_kernel<<>>( + ws.A_ptrs, ws.B_ptrs, ws.C_ptrs, ws.D_ptrs, + ws.M, ws.N, ws.K, + ws.alpha_ptrs, ws.beta_ptrs, + a_base, b_base, c_base, d_base, + A_meta, B_meta, C_meta, D_meta, + a_elem_size, b_elem_size, c_elem_size, d_elem_size, + static_cast(alpha_tensor->data.dptr), + static_cast(beta_tensor->data.dptr), + transa, transb, num_tensors); + + NVTE_CHECK_CUDA(cudaGetLastError()); +} + +// Constants for grouped GEMM workspace +static constexpr size_t kGroupedGemmAlignment = 256; +static constexpr size_t kGroupedGemmCublasWorkspaceSize = 32ull * 1024 * 1024; // 32 MiB + +inline size_t grouped_gemm_setup_workspace_size(size_t num_tensors) { + return GroupedGemmSetupWorkspace::required_setup_size(num_tensors, kGroupedGemmAlignment); +} + +void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, + const NVTEGroupedTensor A, const NVTEGroupedTensor B, + const NVTETensor beta, const NVTEGroupedTensor C, NVTEGroupedTensor D, + NVTETensor workspace_setup, NVTETensor workspace_cublas, + NVTEMatmulConfig config, cudaStream_t stream, + const int64_t* avg_m, const int64_t* avg_n, const int64_t* avg_k) { + NVTE_API_CALL(nvte_grouped_gemm); + using namespace transformer_engine; + + // Convert to internal types + const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A); + const GroupedTensor *inputB = convertNVTEGroupedTensorCheck(B); + const GroupedTensor *inputC = convertNVTEGroupedTensorCheck(C); + GroupedTensor *outputD = convertNVTEGroupedTensorCheck(D); + const Tensor *alpha_tensor = convertNVTETensorCheck(alpha); + const Tensor *beta_tensor = convertNVTETensorCheck(beta); + Tensor *wspace_setup = convertNVTETensor(workspace_setup); + Tensor *wspace_cublas = convertNVTETensor(workspace_cublas); + + // Validate inputs and num_tensors + validate_grouped_gemm_inputs(inputA, inputB, inputC, outputD); + const size_t num_tensors = inputA->num_tensors; + + // Select operand storage (row-wise vs column-wise) and adjust transpose flags to + // mirror the non-grouped GEMM logic for FP8 layout constraints. + bool transa_flag = static_cast(transa); + bool transb_flag = static_cast(transb); + const auto A_sel = select_grouped_operand(inputA, transa_flag, /*is_A=*/true); + const auto B_sel = select_grouped_operand(inputB, transb_flag, /*is_A=*/false); + transa_flag = A_sel.trans; + transb_flag = B_sel.trans; + const size_t a_elem_size = transformer_engine::typeToSize(A_sel.dtype); + const size_t b_elem_size = transformer_engine::typeToSize(B_sel.dtype); + + // Workspaces: setup (pointer arrays) and cuBLAS + const size_t setup_workspace_size = grouped_gemm_setup_workspace_size(num_tensors); + const size_t cublas_workspace_size = kGroupedGemmCublasWorkspaceSize; + + void* setup_workspace_ptr = + validate_and_get_workspace_ptr(wspace_setup, setup_workspace_size, "Grouped GEMM setup workspace"); + void* cublas_workspace_ptr = + validate_and_get_workspace_ptr(wspace_cublas, cublas_workspace_size, "Grouped GEMM cuBLAS workspace"); + + NVTE_CHECK(cublas_workspace_ptr != nullptr, "Grouped GEMM: cuBLAS workspace pointer is null"); + + auto setup_workspace = GroupedGemmSetupWorkspace::from_buffers( + static_cast(setup_workspace_ptr), num_tensors, kGroupedGemmAlignment); + launch_grouped_gemm_setup(setup_workspace, inputA, inputB, inputC, outputD, + alpha_tensor, beta_tensor, + A_sel.base, B_sel.base, a_elem_size, b_elem_size, + transa_flag, transb_flag, + num_tensors, stream); + + // Get cuBLAS handle + using cublasHandleManager = detail::HandleManager; + cublasLtHandle_t handle = cublasHandleManager::Instance().GetHandle(); + + // Get data types + const cudaDataType_t A_type = get_cuda_dtype(A_sel.dtype); + const cudaDataType_t B_type = get_cuda_dtype(B_sel.dtype); + const cudaDataType_t D_type = get_cuda_dtype(outputD->dtype()); + + // Setup cuBLAS operations + cublasOperation_t op_A = transa_flag ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op_B = transb_flag ? CUBLAS_OP_T : CUBLAS_OP_N; + + // Create grouped matrix layouts + cublasLtMatrixLayoutOpaque_t descA, descB, descC, descD; + init_matrix_layouts(descA, descB, descC, descD, setup_workspace, + transa_flag, transb_flag, A_sel.use_columnwise, B_sel.use_columnwise, + num_tensors, A_type, B_type, D_type); + + // Create matmul descriptor + cublasLtMatmulDescOpaque_t matmulDesc; + init_matmul_desc(matmulDesc, op_A, op_B); + + // Compute average dimensions for heuristics + // K dimension: if transa, K is A's first dim; if not, K is A's last dim + int64_t avg_m_val = avg_m ? *avg_m : compute_avg_first_dim(outputD); + int64_t avg_n_val = avg_n ? *avg_n : compute_avg_last_dim(outputD); + int64_t avg_k_val = + avg_k ? *avg_k : (transa_flag ? compute_avg_first_dim(inputA) : compute_avg_last_dim(inputA)); + + // Heuristic selection + cublasLtMatmulAlgo_t algo = + select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, descD, avg_m_val, avg_n_val, + avg_k_val); + + // Execute the grouped GEMM + NVTE_CHECK_CUBLAS(cublasLtMatmul(handle, &matmulDesc, setup_workspace.alpha_ptrs, + setup_workspace.A_ptrs, &descA, setup_workspace.B_ptrs, &descB, + setup_workspace.beta_ptrs, setup_workspace.C_ptrs, + &descC, setup_workspace.D_ptrs, &descD, + &algo, cublas_workspace_ptr, + kGroupedGemmCublasWorkspaceSize, stream)); +} diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 950014cc9b..51241aef6b 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -228,6 +228,42 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor bool transa, bool transb, bool grad, NVTETensor *workspace, bool accumulate, bool use_split_accumulator, int math_sm_count, cudaStream_t stream); + +/* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */ +/*! \brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C + * + * Performs batched GEMM on a collection of matrices with potentially different shapes. + * All tensors in the group must have compatible dimensions for matrix multiplication. + * Uses NVTEGroupedTensor to efficiently handle collections of tensors with contiguous + * memory layout and shape metadata. + * + * \param[in] transa Whether to transpose A matrices. + * \param[in] transb Whether to transpose B matrices. + * \param[in] alpha Scale multiplier for A @ B (NVTETensor with num_tensors elements, + * or single element for uniform alpha). + * \param[in] A Input grouped tensor A. + * \param[in] B Input grouped tensor B. + * \param[in] beta Scale multiplier for C (NVTETensor with num_tensors elements, + * or single element for uniform beta). + * \param[in] C Input grouped tensor C (can be NULL for beta=0). + * \param[out] D Output grouped tensor D. + * \param[in] workspace Workspace tensor for intermediate computations. + * \param[in] config Matrix multiplication configuration. + * \param[in] stream CUDA stream for the operation. + * + * Requirements: + * - A, B, C (if provided), D must have the same num_tensors + * - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i] + * - Shape compatibility: if transa=false, transb=false: + * - A[i]: (M[i], K[i]), B[i]: (K[i], N[i]), D[i]: (M[i], N[i]) + */ +void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, + const NVTEGroupedTensor A, const NVTEGroupedTensor B, + const NVTETensor beta, const NVTEGroupedTensor C, NVTEGroupedTensor D, + NVTETensor workspace_setup, NVTETensor workspace_cublas, + NVTEMatmulConfig config, cudaStream_t stream, + const int64_t* avg_m, const int64_t* avg_n, const int64_t* avg_k); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus From 76293d4dc9ebb8a7e1c7ba2ae47f866d56998d33 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Dec 2025 14:32:15 +0000 Subject: [PATCH 04/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/cpp/operator/test_grouped_gemm.cu | 2 - .../common/gemm/cublaslt_gemm.cu | 279 +++++++++--------- .../common/include/transformer_engine/gemm.h | 11 +- 3 files changed, 141 insertions(+), 151 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 0e9c6c6a4d..d346e06887 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -507,5 +507,3 @@ INSTANTIATE_TEST_SUITE_P(OperatorTest, MakeGroupedGemmTestName); } // namespace - - diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 53be59cc00..2c8c2093c6 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1105,46 +1105,42 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor } } - // Helper struct to pass per-tensor shape/offset info (pointer or uniform value) struct TensorShapeInfo { - const int64_t *first_dims; // nullptr if uniform - const int64_t *last_dims; // nullptr if uniform - const int64_t *offsets; // nullptr if need to compute - int64_t uniform_first; // used if first_dims == nullptr - int64_t uniform_last; // used if last_dims == nullptr + const int64_t *first_dims; // nullptr if uniform + const int64_t *last_dims; // nullptr if uniform + const int64_t *offsets; // nullptr if need to compute + int64_t uniform_first; // used if first_dims == nullptr + int64_t uniform_last; // used if last_dims == nullptr // Create from GroupedTensor static TensorShapeInfo from_tensor(const transformer_engine::GroupedTensor *t) { - return { - t->first_dims.has_data() ? static_cast(t->first_dims.dptr) : nullptr, - t->last_dims.has_data() ? static_cast(t->last_dims.dptr) : nullptr, - t->tensor_offsets.has_data() ? static_cast(t->tensor_offsets.dptr) : nullptr, - t->get_common_first_dim(), - t->get_common_last_dim()}; + return {t->first_dims.has_data() ? static_cast(t->first_dims.dptr) : nullptr, + t->last_dims.has_data() ? static_cast(t->last_dims.dptr) : nullptr, + t->tensor_offsets.has_data() ? static_cast(t->tensor_offsets.dptr) + : nullptr, + t->get_common_first_dim(), t->get_common_last_dim()}; } // Create for C tensor (uses D's dimensions, only has offsets) static TensorShapeInfo for_C(const transformer_engine::GroupedTensor *C, const transformer_engine::GroupedTensor *D) { - return { - nullptr, - nullptr, - C->tensor_offsets.has_data() ? static_cast(C->tensor_offsets.dptr) : nullptr, - D->get_common_first_dim(), - D->get_common_last_dim()}; + return {nullptr, nullptr, + C->tensor_offsets.has_data() ? static_cast(C->tensor_offsets.dptr) + : nullptr, + D->get_common_first_dim(), D->get_common_last_dim()}; } }; // Helper functions to compute average dimensions from logical_shape for heuristics // These are hints for cuBLASLt algorithm selection, don't need to be exact -inline int64_t compute_avg_first_dim(const transformer_engine::GroupedTensor* t) { +inline int64_t compute_avg_first_dim(const transformer_engine::GroupedTensor *t) { // logical_shape[0] is either num_tensors*M (uniform) or sum_of_M (varying first) // In both cases, dividing by num_tensors gives the average return static_cast(t->logical_shape.data[0]) / static_cast(t->num_tensors); } -inline int64_t compute_avg_last_dim(const transformer_engine::GroupedTensor* t) { +inline int64_t compute_avg_last_dim(const transformer_engine::GroupedTensor *t) { if (t->all_same_last_dim()) { // logical_shape[1] is the common N return static_cast(t->logical_shape.data[1]); @@ -1167,21 +1163,31 @@ struct GroupedGemmSetupWorkspace { float **beta_ptrs; // Initialize from workspace buffer - static GroupedGemmSetupWorkspace from_buffers(char *setup_ws_ptr, size_t num_tensors, size_t alignment) { + static GroupedGemmSetupWorkspace from_buffers(char *setup_ws_ptr, size_t num_tensors, + size_t alignment) { GroupedGemmSetupWorkspace ws; size_t offset = 0; const size_t ptr_size = num_tensors * sizeof(void *); const size_t int_size = num_tensors * sizeof(int); - ws.A_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; - ws.B_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; - ws.C_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; - ws.D_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; - ws.M = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; - ws.N = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; - ws.K = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; - ws.alpha_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; - ws.beta_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; + ws.A_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.B_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.C_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.D_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.M = reinterpret_cast(setup_ws_ptr + offset); + offset += int_size; + ws.N = reinterpret_cast(setup_ws_ptr + offset); + offset += int_size; + ws.K = reinterpret_cast(setup_ws_ptr + offset); + offset += int_size; + ws.alpha_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.beta_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; offset = ((offset + alignment - 1) / alignment) * alignment; @@ -1201,10 +1207,10 @@ struct GroupedGemmSetupWorkspace { // ----------------------------------------------------------------------------- // Helper routines to keep nvte_grouped_gemm readable // ----------------------------------------------------------------------------- -inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor* inputA, - const transformer_engine::GroupedTensor* inputB, - const transformer_engine::GroupedTensor* inputC, - const transformer_engine::GroupedTensor* outputD) { +inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor *inputA, + const transformer_engine::GroupedTensor *inputB, + const transformer_engine::GroupedTensor *inputC, + const transformer_engine::GroupedTensor *outputD) { const size_t num_tensors = inputA->num_tensors; NVTE_CHECK(num_tensors >= 1, "Grouped GEMM: num_tensors must be at least 1"); NVTE_CHECK(inputB->num_tensors == num_tensors, @@ -1235,23 +1241,24 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor // Mirrors the non-grouped GEMM logic for FP8 layout handling (TN-only on Hopper) and // fallback to column-wise data when row-wise is absent. struct GroupedOperandSelection { - const char* base = nullptr; + const char *base = nullptr; transformer_engine::DType dtype = transformer_engine::DType::kNumTypes; bool trans = false; bool use_columnwise = false; }; -inline GroupedOperandSelection select_grouped_operand(const transformer_engine::GroupedTensor* t, +inline GroupedOperandSelection select_grouped_operand(const transformer_engine::GroupedTensor *t, bool trans, bool is_A) { using namespace transformer_engine; const bool has_row = t->has_data(); const bool has_col = t->has_columnwise_data(); - NVTE_CHECK(has_row || has_col, "Grouped GEMM operand is missing both row-wise and column-wise data"); + NVTE_CHECK(has_row || has_col, + "Grouped GEMM operand is missing both row-wise and column-wise data"); // Not yet supported in grouped GEMM: block scaling, MXFP8, NVFP4 specialized layouts. const auto sm = t->scaling_mode; - NVTE_CHECK(sm != NVTE_BLOCK_SCALING_1D && sm != NVTE_BLOCK_SCALING_2D && - !is_mxfp_scaling(sm) && !is_nvfp_scaling(sm), + NVTE_CHECK(sm != NVTE_BLOCK_SCALING_1D && sm != NVTE_BLOCK_SCALING_2D && !is_mxfp_scaling(sm) && + !is_nvfp_scaling(sm), "Grouped GEMM does not yet support NVFP4/MXFP8/block scaling operand selection"); const DType row_dtype = t->data.dtype; @@ -1268,7 +1275,7 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: if (is_A) { if (!sel.trans) { NVTE_CHECK(has_col, "Grouped GEMM: A is missing column-wise data needed for FP8 TN layout"); - sel.base = static_cast(t->columnwise_data.dptr); + sel.base = static_cast(t->columnwise_data.dptr); sel.dtype = col_dtype; sel.trans = true; // using pre-transposed storage sel.use_columnwise = true; @@ -1277,7 +1284,7 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: } else { // B if (sel.trans) { NVTE_CHECK(has_col, "Grouped GEMM: B is missing column-wise data needed for FP8 TN layout"); - sel.base = static_cast(t->columnwise_data.dptr); + sel.base = static_cast(t->columnwise_data.dptr); sel.dtype = col_dtype; sel.trans = false; // using pre-transposed storage sel.use_columnwise = true; @@ -1288,7 +1295,7 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: // If only column-wise data is available, mirror the transpose flag (pre-transposed storage). if (!has_row && has_col) { - sel.base = static_cast(t->columnwise_data.dptr); + sel.base = static_cast(t->columnwise_data.dptr); sel.dtype = col_dtype; sel.trans = !sel.trans; sel.use_columnwise = true; @@ -1296,81 +1303,81 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: } // Default: use row-wise data (or column-wise if row-wise absent, covered above). - sel.base = static_cast(has_row ? t->data.dptr : t->columnwise_data.dptr); + sel.base = static_cast(has_row ? t->data.dptr : t->columnwise_data.dptr); sel.dtype = has_row ? row_dtype : col_dtype; - sel.use_columnwise = !has_row && has_col; + sel.use_columnwise = !has_row && has_col; return sel; } -inline void* validate_and_get_workspace_ptr(transformer_engine::Tensor* ws, size_t required_size, - const char* workspace_name) { +inline void *validate_and_get_workspace_ptr(transformer_engine::Tensor *ws, size_t required_size, + const char *workspace_name) { NVTE_CHECK(ws != nullptr, workspace_name, " tensor is null."); const size_t provided_size = get_buffer_size_bytes(ws->data.numel(), ws->data.dtype); - NVTE_CHECK(provided_size >= required_size, - "Grouped GEMM: Insufficient ", workspace_name, ". Required: ", required_size, - " bytes, Available: ", provided_size, " bytes."); + NVTE_CHECK(provided_size >= required_size, "Grouped GEMM: Insufficient ", workspace_name, + ". Required: ", required_size, " bytes, Available: ", provided_size, " bytes."); return ws->data.dptr; } -inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t& descA, - cublasLtMatrixLayoutOpaque_t& descB, - cublasLtMatrixLayoutOpaque_t& descC, - cublasLtMatrixLayoutOpaque_t& descD, - const GroupedGemmWorkspace& ws, bool transa, bool transb, - bool a_columnwise, bool b_columnwise, +inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, + cublasLtMatrixLayoutOpaque_t &descB, + cublasLtMatrixLayoutOpaque_t &descC, + cublasLtMatrixLayoutOpaque_t &descD, const GroupedGemmWorkspace &ws, + bool transa, bool transb, bool a_columnwise, bool b_columnwise, size_t num_tensors, cudaDataType_t A_type, cudaDataType_t B_type, cudaDataType_t D_type) { // For column-major layout: leading dimension is the number of rows in storage. // If columnwise data was chosen, storage is already transposed. - const int* rowa = a_columnwise ? ws.M : (transa ? ws.K : ws.M); - const int* cola = a_columnwise ? ws.K : (transa ? ws.M : ws.K); - const int* lda = rowa; - const int* rowb = b_columnwise ? ws.N : (transb ? ws.N : ws.K); - const int* colb = b_columnwise ? ws.K : (transb ? ws.K : ws.N); - const int* ldb = rowb; - - NVTE_CHECK_CUBLAS( - cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, (void*)rowa, (void*)cola, (void*)lda)); - NVTE_CHECK_CUBLAS( - cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, (void*)rowb, (void*)colb, (void*)ldb)); - NVTE_CHECK_CUBLAS( - cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, (void*)ws.M, (void*)ws.N, (void*)ws.M)); - NVTE_CHECK_CUBLAS( - cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, (void*)ws.M, (void*)ws.N, (void*)ws.M)); + const int *rowa = a_columnwise ? ws.M : (transa ? ws.K : ws.M); + const int *cola = a_columnwise ? ws.K : (transa ? ws.M : ws.K); + const int *lda = rowa; + const int *rowb = b_columnwise ? ws.N : (transb ? ws.N : ws.K); + const int *colb = b_columnwise ? ws.K : (transb ? ws.K : ws.N); + const int *ldb = rowb; + + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, (void *)rowa, + (void *)cola, (void *)lda)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, (void *)rowb, + (void *)colb, (void *)ldb)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, (void *)ws.M, + (void *)ws.N, (void *)ws.M)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, (void *)ws.M, + (void *)ws.N, (void *)ws.M)); } -inline void init_matmul_desc(cublasLtMatmulDescOpaque_t& matmulDesc, cublasOperation_t op_A, +inline void init_matmul_desc(cublasLtMatmulDescOpaque_t &matmulDesc, cublasOperation_t op_A, cublasOperation_t op_B) { NVTE_CHECK_CUBLAS(cublasLtMatmulDescInit(&matmulDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - NVTE_CHECK_CUBLAS( - cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(op_A))); - NVTE_CHECK_CUBLAS( - cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(op_B))); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &op_A, + sizeof(op_A))); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &op_B, + sizeof(op_B))); cublasLtPointerMode_t pointer_mode = CUBLASLT_POINTER_MODE_DEVICE; NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode))); int64_t alphabeta_batch_stride = 1; - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_ALPHA_BATCH_STRIDE, + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, + CUBLASLT_MATMUL_DESC_ALPHA_BATCH_STRIDE, &alphabeta_batch_stride, sizeof(int64_t))); - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_BETA_BATCH_STRIDE, + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, + CUBLASLT_MATMUL_DESC_BETA_BATCH_STRIDE, &alphabeta_batch_stride, sizeof(int64_t))); } inline cublasLtMatmulAlgo_t select_grouped_gemm_algo(cublasLtHandle_t handle, - cublasLtMatmulDescOpaque_t& matmulDesc, - cublasLtMatrixLayoutOpaque_t& descA, - cublasLtMatrixLayoutOpaque_t& descB, - cublasLtMatrixLayoutOpaque_t& descC, - cublasLtMatrixLayoutOpaque_t& descD, int64_t avg_m, - int64_t avg_n, int64_t avg_k) { + cublasLtMatmulDescOpaque_t &matmulDesc, + cublasLtMatrixLayoutOpaque_t &descA, + cublasLtMatrixLayoutOpaque_t &descB, + cublasLtMatrixLayoutOpaque_t &descC, + cublasLtMatrixLayoutOpaque_t &descD, + int64_t avg_m, int64_t avg_n, int64_t avg_k) { cublasLtMatmulPreferenceOpaque_t preference; NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceInit(&preference)); - NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( - &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &kGroupedGemmCublasWorkspaceSize, - sizeof(size_t))); + NVTE_CHECK_CUBLAS( + cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &kGroupedGemmCublasWorkspaceSize, sizeof(size_t))); NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( &preference, CUBLASLT_MATMUL_PREF_GROUPED_DESC_D_AVERAGE_ROWS, &avg_m, sizeof(int64_t))); NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( @@ -1382,7 +1389,8 @@ inline cublasLtMatmulAlgo_t select_grouped_gemm_algo(cublasLtHandle_t handle, int returnedResults = 0; auto status = cublasLtMatmulAlgoGetHeuristic(handle, &matmulDesc, &descA, &descB, &descC, &descD, &preference, 1, &heuristicResult, &returnedResults); - NVTE_CHECK(status != CUBLAS_STATUS_NOT_SUPPORTED, "Unable to find suitable cuBLAS grouped GEMM algorithm"); + NVTE_CHECK(status != CUBLAS_STATUS_NOT_SUPPORTED, + "Unable to find suitable cuBLAS grouped GEMM algorithm"); NVTE_CHECK_CUBLAS(status); NVTE_CHECK(returnedResults > 0, "No suitable algorithm found for grouped GEMM"); return heuristicResult.algo; @@ -1394,14 +1402,12 @@ inline cublasLtMatmulAlgo_t select_grouped_gemm_algo(cublasLtHandle_t handle, // We bridge the mismatch on GPU by computing per-group pointers and dims in one kernel. __global__ void setup_grouped_gemm_kernel( // Output arrays - void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, - int *M, int *N, int *K, + void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, int *M, int *N, int *K, float **alpha_ptrs, float **beta_ptrs, // Base pointers const char *a_base, const char *b_base, const char *c_base, char *d_base, // Dimension info (per tensor) - TensorShapeInfo A_meta, TensorShapeInfo B_meta, - TensorShapeInfo C_meta, TensorShapeInfo D_meta, + TensorShapeInfo A_meta, TensorShapeInfo B_meta, TensorShapeInfo C_meta, TensorShapeInfo D_meta, // Element sizes size_t a_elem_size, size_t b_elem_size, size_t c_elem_size, size_t d_elem_size, // Alpha/beta pointers (same for all groups) @@ -1410,7 +1416,6 @@ __global__ void setup_grouped_gemm_kernel( bool transa, bool transb, // Number of tensors size_t num_tensors) { - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_tensors) return; @@ -1421,10 +1426,14 @@ __global__ void setup_grouped_gemm_kernel( int64_t b_last = B_meta.last_dims ? B_meta.last_dims[idx] : B_meta.uniform_last; // Compute offsets (from array or compute from uniform dims) - int64_t a_offset = A_meta.offsets ? A_meta.offsets[idx] : (idx * A_meta.uniform_first * A_meta.uniform_last); - int64_t b_offset = B_meta.offsets ? B_meta.offsets[idx] : (idx * B_meta.uniform_first * B_meta.uniform_last); - int64_t c_offset = C_meta.offsets ? C_meta.offsets[idx] : (idx * C_meta.uniform_first * C_meta.uniform_last); - int64_t d_offset = D_meta.offsets ? D_meta.offsets[idx] : (idx * D_meta.uniform_first * D_meta.uniform_last); + int64_t a_offset = + A_meta.offsets ? A_meta.offsets[idx] : (idx * A_meta.uniform_first * A_meta.uniform_last); + int64_t b_offset = + B_meta.offsets ? B_meta.offsets[idx] : (idx * B_meta.uniform_first * B_meta.uniform_last); + int64_t c_offset = + C_meta.offsets ? C_meta.offsets[idx] : (idx * C_meta.uniform_first * C_meta.uniform_last); + int64_t d_offset = + D_meta.offsets ? D_meta.offsets[idx] : (idx * D_meta.uniform_first * D_meta.uniform_last); // Compute data pointers A_ptrs[idx] = const_cast(a_base) + a_offset * a_elem_size; @@ -1444,18 +1453,12 @@ __global__ void setup_grouped_gemm_kernel( // Launch the setup kernel to populate workspace arrays inline void launch_grouped_gemm_setup( - const GroupedGemmWorkspace &ws, - const transformer_engine::GroupedTensor *A, - const transformer_engine::GroupedTensor *B, - const transformer_engine::GroupedTensor *C, - const transformer_engine::GroupedTensor *D, - const transformer_engine::Tensor *alpha_tensor, - const transformer_engine::Tensor *beta_tensor, - const char *a_base, const char *b_base, - size_t a_elem_size, size_t b_elem_size, - bool transa, bool transb, - size_t num_tensors, cudaStream_t stream) { - + const GroupedGemmWorkspace &ws, const transformer_engine::GroupedTensor *A, + const transformer_engine::GroupedTensor *B, const transformer_engine::GroupedTensor *C, + const transformer_engine::GroupedTensor *D, const transformer_engine::Tensor *alpha_tensor, + const transformer_engine::Tensor *beta_tensor, const char *a_base, const char *b_base, + size_t a_elem_size, size_t b_elem_size, bool transa, bool transb, size_t num_tensors, + cudaStream_t stream) { TensorShapeInfo A_meta = TensorShapeInfo::from_tensor(A); TensorShapeInfo B_meta = TensorShapeInfo::from_tensor(B); TensorShapeInfo C_meta = TensorShapeInfo::for_C(C, D); @@ -1471,15 +1474,10 @@ inline void launch_grouped_gemm_setup( const int num_blocks = (num_tensors + threads_per_block - 1) / threads_per_block; setup_grouped_gemm_kernel<<>>( - ws.A_ptrs, ws.B_ptrs, ws.C_ptrs, ws.D_ptrs, - ws.M, ws.N, ws.K, - ws.alpha_ptrs, ws.beta_ptrs, - a_base, b_base, c_base, d_base, - A_meta, B_meta, C_meta, D_meta, - a_elem_size, b_elem_size, c_elem_size, d_elem_size, - static_cast(alpha_tensor->data.dptr), - static_cast(beta_tensor->data.dptr), - transa, transb, num_tensors); + ws.A_ptrs, ws.B_ptrs, ws.C_ptrs, ws.D_ptrs, ws.M, ws.N, ws.K, ws.alpha_ptrs, ws.beta_ptrs, + a_base, b_base, c_base, d_base, A_meta, B_meta, C_meta, D_meta, a_elem_size, b_elem_size, + c_elem_size, d_elem_size, static_cast(alpha_tensor->data.dptr), + static_cast(beta_tensor->data.dptr), transa, transb, num_tensors); NVTE_CHECK_CUDA(cudaGetLastError()); } @@ -1492,12 +1490,11 @@ inline size_t grouped_gemm_setup_workspace_size(size_t num_tensors) { return GroupedGemmSetupWorkspace::required_setup_size(num_tensors, kGroupedGemmAlignment); } -void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, - const NVTEGroupedTensor A, const NVTEGroupedTensor B, - const NVTETensor beta, const NVTEGroupedTensor C, NVTEGroupedTensor D, - NVTETensor workspace_setup, NVTETensor workspace_cublas, - NVTEMatmulConfig config, cudaStream_t stream, - const int64_t* avg_m, const int64_t* avg_n, const int64_t* avg_k) { +void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, + const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, + NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, + NVTEMatmulConfig config, cudaStream_t stream, const int64_t *avg_m, + const int64_t *avg_n, const int64_t *avg_k) { NVTE_API_CALL(nvte_grouped_gemm); using namespace transformer_engine; @@ -1530,20 +1527,18 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const size_t setup_workspace_size = grouped_gemm_setup_workspace_size(num_tensors); const size_t cublas_workspace_size = kGroupedGemmCublasWorkspaceSize; - void* setup_workspace_ptr = - validate_and_get_workspace_ptr(wspace_setup, setup_workspace_size, "Grouped GEMM setup workspace"); - void* cublas_workspace_ptr = - validate_and_get_workspace_ptr(wspace_cublas, cublas_workspace_size, "Grouped GEMM cuBLAS workspace"); + void *setup_workspace_ptr = validate_and_get_workspace_ptr(wspace_setup, setup_workspace_size, + "Grouped GEMM setup workspace"); + void *cublas_workspace_ptr = validate_and_get_workspace_ptr(wspace_cublas, cublas_workspace_size, + "Grouped GEMM cuBLAS workspace"); NVTE_CHECK(cublas_workspace_ptr != nullptr, "Grouped GEMM: cuBLAS workspace pointer is null"); auto setup_workspace = GroupedGemmSetupWorkspace::from_buffers( - static_cast(setup_workspace_ptr), num_tensors, kGroupedGemmAlignment); - launch_grouped_gemm_setup(setup_workspace, inputA, inputB, inputC, outputD, - alpha_tensor, beta_tensor, - A_sel.base, B_sel.base, a_elem_size, b_elem_size, - transa_flag, transb_flag, - num_tensors, stream); + static_cast(setup_workspace_ptr), num_tensors, kGroupedGemmAlignment); + launch_grouped_gemm_setup(setup_workspace, inputA, inputB, inputC, outputD, alpha_tensor, + beta_tensor, A_sel.base, B_sel.base, a_elem_size, b_elem_size, + transa_flag, transb_flag, num_tensors, stream); // Get cuBLAS handle using cublasHandleManager = detail::HandleManager; @@ -1560,9 +1555,9 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, // Create grouped matrix layouts cublasLtMatrixLayoutOpaque_t descA, descB, descC, descD; - init_matrix_layouts(descA, descB, descC, descD, setup_workspace, - transa_flag, transb_flag, A_sel.use_columnwise, B_sel.use_columnwise, - num_tensors, A_type, B_type, D_type); + init_matrix_layouts(descA, descB, descC, descD, setup_workspace, transa_flag, transb_flag, + A_sel.use_columnwise, B_sel.use_columnwise, num_tensors, A_type, B_type, + D_type); // Create matmul descriptor cublasLtMatmulDescOpaque_t matmulDesc; @@ -1576,15 +1571,13 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, avg_k ? *avg_k : (transa_flag ? compute_avg_first_dim(inputA) : compute_avg_last_dim(inputA)); // Heuristic selection - cublasLtMatmulAlgo_t algo = - select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, descD, avg_m_val, avg_n_val, - avg_k_val); + cublasLtMatmulAlgo_t algo = select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, + descD, avg_m_val, avg_n_val, avg_k_val); // Execute the grouped GEMM NVTE_CHECK_CUBLAS(cublasLtMatmul(handle, &matmulDesc, setup_workspace.alpha_ptrs, - setup_workspace.A_ptrs, &descA, setup_workspace.B_ptrs, &descB, - setup_workspace.beta_ptrs, setup_workspace.C_ptrs, - &descC, setup_workspace.D_ptrs, &descD, - &algo, cublas_workspace_ptr, + setup_workspace.A_ptrs, &descA, setup_workspace.B_ptrs, &descB, + setup_workspace.beta_ptrs, setup_workspace.C_ptrs, &descC, + setup_workspace.D_ptrs, &descD, &algo, cublas_workspace_ptr, kGroupedGemmCublasWorkspaceSize, stream)); } diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 51241aef6b..948058295e 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -257,12 +257,11 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * - Shape compatibility: if transa=false, transb=false: * - A[i]: (M[i], K[i]), B[i]: (K[i], N[i]), D[i]: (M[i], N[i]) */ -void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, - const NVTEGroupedTensor A, const NVTEGroupedTensor B, - const NVTETensor beta, const NVTEGroupedTensor C, NVTEGroupedTensor D, - NVTETensor workspace_setup, NVTETensor workspace_cublas, - NVTEMatmulConfig config, cudaStream_t stream, - const int64_t* avg_m, const int64_t* avg_n, const int64_t* avg_k); +void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, + const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, + NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, + NVTEMatmulConfig config, cudaStream_t stream, const int64_t *avg_m, + const int64_t *avg_n, const int64_t *avg_k); #ifdef __cplusplus } // extern "C" From 296d77362099c52fa8e19a299f4a4134dc184096 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Wed, 10 Dec 2025 18:25:39 +0100 Subject: [PATCH 05/98] Add FP8 scale support and fix alignment for grouped GEMM - Add FP8 scale_inv pointer handling in nvte_grouped_gemm for proper FP8 GEMM - Fix random padding in tests to ensure 16-byte alignment for all dtypes - Reorder GroupedGemmSetupWorkspace members for natural alignment - Remove debug prints Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 55 +++++--- .../common/gemm/cublaslt_gemm.cu | 119 +++++++++++++----- .../common/include/transformer_engine/gemm.h | 2 + 3 files changed, 131 insertions(+), 45 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index d346e06887..bff175f405 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -1,8 +1,8 @@ -/*********************************************************************** - * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. +/************************************************************************* + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE for license information. - **********************************************************************/ + ************************************************************************/ #include #include @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include "../test_common.h" @@ -136,7 +138,7 @@ GroupedBuffers build_grouped_tensor(const std::vector& tensors, const NVTEShape shape = tensors[0]->rowwise_shape(); const DType dtype = tensors[0]->dtype(); const size_t num_tensors = tensors.size(); - const size_t elem_size = typeToSize(dtype); + const size_t elem_size = typeToNumBits(dtype) / 8; GroupedBuffers grouped; grouped.elem_size = elem_size; grouped.num_tensors = num_tensors; @@ -162,9 +164,13 @@ GroupedBuffers build_grouped_tensor(const std::vector& tensors, std::vector offsets(num_tensors, 0); auto random_padding = [&]() -> int64_t { + // Random padding ensuring 16-byte alignment regardless of element size + // cuBLAS requires aligned pointers for vectorized loads static std::mt19937 gen(12345); std::uniform_int_distribution dist(0, 3); - return dist(gen); + // Calculate elements needed for 16-byte alignment + const size_t align_elements = (16 * 8) / typeToNumBits(dtype); // 16 bytes / element_size + return dist(gen) * static_cast(align_elements); }; auto numel = [&](size_t idx) -> int64_t { @@ -301,7 +307,12 @@ Tensor make_fp8_operand(const std::string& name, const std::vector& shap Tensor make_bf16_operand(const std::string& name, const std::vector& shape) { Tensor t(name, shape, DType::kBFloat16); - fillUniform(&t); + // Fill with ones for easier debugging + //fillUniform(&t); + const size_t numel = shape[0] * shape[1]; + std::vector<__nv_bfloat16> ones(numel, __float2bfloat16(1.0f)); + NVTE_CHECK_CUDA(cudaMemcpy(t.rowwise_dptr(), ones.data(), + numel * sizeof(__nv_bfloat16), cudaMemcpyHostToDevice)); return t; } @@ -312,17 +323,21 @@ struct TestParams { ShapeCase shape_case; }; +// Returns a vector of (M, N, K) tuples for each GEMM in the group. +// M - number of rows in output D +// N - number of columns in output D +// K - reduction dimension shared between A and B std::vector> make_shapes(ShapeCase scase) { switch (scase) { case ShapeCase::kAllSame: return {{64, 64, 32}, {64, 64, 32}, {64, 64, 32}}; - case ShapeCase::kSameFirst: // M wspólne, N/K zróżnicowane - return {{64, 64, 32}, {64, 96, 32}, {64, 80, 48}}; - case ShapeCase::kSameLast: // N wspólne, M/K zróżnicowane - return {{48, 80, 32}, {96, 80, 48}, {72, 80, 40}}; + case ShapeCase::kSameFirst: + return {{64, 80, 32}, {64, 80, 48}, {64, 80, 64}}; + case ShapeCase::kSameLast: + return {{64, 80, 32}, {64, 80, 48}, {64, 80, 64}}; case ShapeCase::kAllDifferent: default: - return {{48, 80, 32}, {96, 64, 48}, {40, 72, 24}}; + return {{64, 96, 32}, {64, 96, 48}, {64, 96, 64}}; } } @@ -345,10 +360,10 @@ void run_grouped_gemm_case(const TestParams& params) { for (size_t i = 0; i < num_gemms; ++i) { const auto [M, N, K] = shapes[i]; - const std::vector a_shape = params.transa ? std::vector{K, M} - : std::vector{M, K}; - const std::vector b_shape = params.transb ? std::vector{N, K} - : std::vector{K, N}; + const std::vector a_shape = params.transa ? std::vector{M, K} + : std::vector{K, M}; + const std::vector b_shape = params.transb ? std::vector{K, N} + : std::vector{N, K}; switch (params.input_case) { case InputCase::kFP8Current: { A_tensors.emplace_back(make_fp8_operand("A" + std::to_string(i), a_shape)); @@ -373,6 +388,10 @@ void run_grouped_gemm_case(const TestParams& params) { std::vector gelu_ptrs(num_gemms, nullptr); std::vector workspaces(num_gemms); std::vector workspace_ptrs(num_gemms, nullptr); + std::vector A_views; + std::vector B_views; + A_views.reserve(num_gemms); + B_views.reserve(num_gemms); const size_t cublas_ws_bytes = 32ull * 1024 * 1024; @@ -382,6 +401,8 @@ void run_grouped_gemm_case(const TestParams& params) { D_ptrs[i] = D_multi[i].data(); workspaces[i] = Tensor("workspace" + std::to_string(i), std::vector{cublas_ws_bytes}, DType::kByte); workspace_ptrs[i] = workspaces[i].data(); + A_views.push_back(&A_tensors[i]); + B_views.push_back(&B_tensors[i]); } nvte_multi_tensor_gemm(A_ptrs.data(), @@ -399,8 +420,8 @@ void run_grouped_gemm_case(const TestParams& params) { 0, 0); - GroupedBuffers grouped_A = build_grouped_tensor(A_tensors, A_tensors[0].scaling_mode()); - GroupedBuffers grouped_B = build_grouped_tensor(B_tensors, B_tensors[0].scaling_mode()); + GroupedBuffers grouped_A = build_grouped_tensor(A_views, A_tensors[0].scaling_mode()); + GroupedBuffers grouped_B = build_grouped_tensor(B_views, B_tensors[0].scaling_mode()); std::vector C_tensors; std::vector D_group_tensors; diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 2c8c2093c6..bb29d58de4 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1115,20 +1115,50 @@ struct TensorShapeInfo { // Create from GroupedTensor static TensorShapeInfo from_tensor(const transformer_engine::GroupedTensor *t) { - return {t->first_dims.has_data() ? static_cast(t->first_dims.dptr) : nullptr, - t->last_dims.has_data() ? static_cast(t->last_dims.dptr) : nullptr, + const bool has_first = t->first_dims.has_data(); + const bool has_last = t->last_dims.has_data(); + // When per-tensor dims are not provided, we must be in the uniform-shape case. + NVTE_CHECK(has_first || t->all_same_first_dim(), + "GroupedTensor is missing first_dims for varying shapes"); + NVTE_CHECK(has_last || t->all_same_last_dim(), + "GroupedTensor is missing last_dims for varying shapes"); + + const int64_t *first_ptr = has_first ? static_cast(t->first_dims.dptr) : nullptr; + const int64_t *last_ptr = has_last ? static_cast(t->last_dims.dptr) : nullptr; + + const int64_t uniform_first = has_first ? 0 : static_cast(t->get_common_first_dim()); + const int64_t uniform_last = has_last ? 0 : static_cast(t->get_common_last_dim()); + + return {first_ptr, + last_ptr, t->tensor_offsets.has_data() ? static_cast(t->tensor_offsets.dptr) : nullptr, - t->get_common_first_dim(), t->get_common_last_dim()}; + uniform_first, + uniform_last}; } // Create for C tensor (uses D's dimensions, only has offsets) static TensorShapeInfo for_C(const transformer_engine::GroupedTensor *C, const transformer_engine::GroupedTensor *D) { - return {nullptr, nullptr, + const bool has_first = D->first_dims.has_data(); + const bool has_last = D->last_dims.has_data(); + NVTE_CHECK(has_first || D->all_same_first_dim(), + "GroupedTensor D is missing first_dims for varying shapes"); + NVTE_CHECK(has_last || D->all_same_last_dim(), + "GroupedTensor D is missing last_dims for varying shapes"); + + const int64_t *first_ptr = + has_first ? static_cast(D->first_dims.dptr) : nullptr; + const int64_t *last_ptr = has_last ? static_cast(D->last_dims.dptr) : nullptr; + const int64_t uniform_first = has_first ? 0 : static_cast(D->get_common_first_dim()); + const int64_t uniform_last = has_last ? 0 : static_cast(D->get_common_last_dim()); + + return {first_ptr, + last_ptr, C->tensor_offsets.has_data() ? static_cast(C->tensor_offsets.dptr) : nullptr, - D->get_common_first_dim(), D->get_common_last_dim()}; + uniform_first, + uniform_last}; } }; @@ -1144,10 +1174,9 @@ inline int64_t compute_avg_last_dim(const transformer_engine::GroupedTensor *t) if (t->all_same_last_dim()) { // logical_shape[1] is the common N return static_cast(t->logical_shape.data[1]); - } else { - // logical_shape[1] is sum_of_N, divide by num_tensors - return static_cast(t->logical_shape.data[1]) / static_cast(t->num_tensors); } + // When varying, logical_shape[1] should be sum of last dims if provided; otherwise fallback to avg via division. + return static_cast(t->logical_shape.data[1]) / static_cast(t->num_tensors); } // Workspace layout for grouped GEMM @@ -1163,6 +1192,7 @@ struct GroupedGemmSetupWorkspace { float **beta_ptrs; // Initialize from workspace buffer + // Layout: all pointer arrays first (8-byte aligned), then int arrays (4-byte aligned) static GroupedGemmSetupWorkspace from_buffers(char *setup_ws_ptr, size_t num_tensors, size_t alignment) { GroupedGemmSetupWorkspace ws; @@ -1170,6 +1200,7 @@ struct GroupedGemmSetupWorkspace { const size_t ptr_size = num_tensors * sizeof(void *); const size_t int_size = num_tensors * sizeof(int); + // Pointer arrays first (all 8-byte aligned) ws.A_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; ws.B_ptrs = reinterpret_cast(setup_ws_ptr + offset); @@ -1178,27 +1209,30 @@ struct GroupedGemmSetupWorkspace { offset += ptr_size; ws.D_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; + ws.alpha_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.beta_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + + // Int arrays last (4-byte aligned, always satisfied after pointer arrays) ws.M = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; ws.N = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; ws.K = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; - ws.alpha_ptrs = reinterpret_cast(setup_ws_ptr + offset); - offset += ptr_size; - ws.beta_ptrs = reinterpret_cast(setup_ws_ptr + offset); - offset += ptr_size; offset = ((offset + alignment - 1) / alignment) * alignment; return ws; } - // Calculate required size for setup workspace (pointer arrays + M/N/K + alpha/beta ptrs) + // Calculate required size for setup workspace (pointer arrays + M/N/K) static size_t required_setup_size(size_t num_tensors, size_t alignment) { const size_t ptr_size = num_tensors * sizeof(void *); const size_t int_size = num_tensors * sizeof(int); - size_t size = 4 * ptr_size + 3 * int_size + 2 * ptr_size; // M, N, K only (no LDA/LDB/LDC/LDD) + // Layout: 6 ptr arrays, then 3 int arrays (no padding needed) + size_t size = 6 * ptr_size + 3 * int_size; size = ((size + alignment - 1) / alignment) * alignment; return size; } @@ -1220,12 +1254,16 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor NVTE_CHECK(outputD->num_tensors == num_tensors, "Grouped GEMM: A and D must have the same num_tensors"); - auto is_fp8_or_16bit = [](DType dtype) { - return dtype == DType::kFloat8E4M3 || dtype == DType::kFloat8E5M2 || - dtype == DType::kBFloat16 || dtype == DType::kFloat16; + auto is_fp8_or_16bit = [](transformer_engine::DType dtype) { + return dtype == transformer_engine::DType::kFloat8E4M3 || + dtype == transformer_engine::DType::kFloat8E5M2 || + dtype == transformer_engine::DType::kBFloat16 || + dtype == transformer_engine::DType::kFloat16; }; - auto is_output_dtype = [](DType dtype) { - return dtype == DType::kBFloat16 || dtype == DType::kFloat16 || dtype == DType::kFloat32; + auto is_output_dtype = [](transformer_engine::DType dtype) { + return dtype == transformer_engine::DType::kBFloat16 || + dtype == transformer_engine::DType::kFloat16 || + dtype == transformer_engine::DType::kFloat32; }; NVTE_CHECK(is_fp8_or_16bit(inputA->dtype()) && is_fp8_or_16bit(inputB->dtype()), "Grouped GEMM inputs must be FP8, BF16, or FP16."); @@ -1321,7 +1359,8 @@ inline void *validate_and_get_workspace_ptr(transformer_engine::Tensor *ws, size inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, cublasLtMatrixLayoutOpaque_t &descB, cublasLtMatrixLayoutOpaque_t &descC, - cublasLtMatrixLayoutOpaque_t &descD, const GroupedGemmWorkspace &ws, + cublasLtMatrixLayoutOpaque_t &descD, + const GroupedGemmSetupWorkspace &ws, bool transa, bool transb, bool a_columnwise, bool b_columnwise, size_t num_tensors, cudaDataType_t A_type, cudaDataType_t B_type, cudaDataType_t D_type) { @@ -1366,6 +1405,10 @@ inline void init_matmul_desc(cublasLtMatmulDescOpaque_t &matmulDesc, cublasOpera &alphabeta_batch_stride, sizeof(int64_t))); } +// Constants for grouped GEMM workspace (declared early for use in heuristics) +static constexpr size_t kGroupedGemmAlignment = 256; +static constexpr size_t kGroupedGemmCublasWorkspaceSize = 32ull * 1024 * 1024; // 32 MiB + inline cublasLtMatmulAlgo_t select_grouped_gemm_algo(cublasLtHandle_t handle, cublasLtMatmulDescOpaque_t &matmulDesc, cublasLtMatrixLayoutOpaque_t &descA, @@ -1442,9 +1485,11 @@ __global__ void setup_grouped_gemm_kernel( D_ptrs[idx] = d_base + d_offset * d_elem_size; // Compute M, N, K dimensions - M[idx] = static_cast(transa ? a_last : a_first); - K[idx] = static_cast(transa ? a_first : a_last); - N[idx] = static_cast(transb ? b_first : b_last); + // Test stores A as {K,M} when !transa, {M,K} when transa + // Test stores B as {N,K} when !transb, {K,N} when transb + M[idx] = static_cast(transa ? a_first : a_last); + K[idx] = static_cast(transa ? a_last : a_first); + N[idx] = static_cast(transb ? b_last : b_first); // Fill alpha/beta pointers (same for all groups) alpha_ptrs[idx] = alpha_ptr; @@ -1453,7 +1498,7 @@ __global__ void setup_grouped_gemm_kernel( // Launch the setup kernel to populate workspace arrays inline void launch_grouped_gemm_setup( - const GroupedGemmWorkspace &ws, const transformer_engine::GroupedTensor *A, + const GroupedGemmSetupWorkspace &ws, const transformer_engine::GroupedTensor *A, const transformer_engine::GroupedTensor *B, const transformer_engine::GroupedTensor *C, const transformer_engine::GroupedTensor *D, const transformer_engine::Tensor *alpha_tensor, const transformer_engine::Tensor *beta_tensor, const char *a_base, const char *b_base, @@ -1482,10 +1527,6 @@ inline void launch_grouped_gemm_setup( NVTE_CHECK_CUDA(cudaGetLastError()); } -// Constants for grouped GEMM workspace -static constexpr size_t kGroupedGemmAlignment = 256; -static constexpr size_t kGroupedGemmCublasWorkspaceSize = 32ull * 1024 * 1024; // 32 MiB - inline size_t grouped_gemm_setup_workspace_size(size_t num_tensors) { return GroupedGemmSetupWorkspace::required_setup_size(num_tensors, kGroupedGemmAlignment); } @@ -1563,6 +1604,28 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT cublasLtMatmulDescOpaque_t matmulDesc; init_matmul_desc(matmulDesc, op_A, op_B); + // Set FP8 scale pointers if needed + const bool is_fp8_a = is_fp8_dtype(A_sel.dtype); + const bool is_fp8_b = is_fp8_dtype(B_sel.dtype); + if (is_fp8_a || is_fp8_b) { + // For FP8 grouped GEMM, we need to pass scale_inv pointers + // The scale_inv arrays contain one float per tensor in the group + if (is_fp8_a) { + void *a_scale_inv = A_sel.use_columnwise ? inputA->columnwise_scale_inv.dptr + : inputA->scale_inv.dptr; + NVTE_CHECK(a_scale_inv != nullptr, "FP8 grouped GEMM: A scale_inv is required"); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + &matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_inv, sizeof(a_scale_inv))); + } + if (is_fp8_b) { + void *b_scale_inv = B_sel.use_columnwise ? inputB->columnwise_scale_inv.dptr + : inputB->scale_inv.dptr; + NVTE_CHECK(b_scale_inv != nullptr, "FP8 grouped GEMM: B scale_inv is required"); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + &matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_inv, sizeof(b_scale_inv))); + } + } + // Compute average dimensions for heuristics // K dimension: if transa, K is A's first dim; if not, K is A's last dim int64_t avg_m_val = avg_m ? *avg_m : compute_avg_first_dim(outputD); diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 948058295e..246fb5fefd 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -11,6 +11,8 @@ #ifndef TRANSFORMER_ENGINE_GEMM_H_ #define TRANSFORMER_ENGINE_GEMM_H_ +#include + #include "transformer_engine.h" #ifdef __cplusplus From 785df3440a443b72340dfdf33db7391280e3a968 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Dec 2025 17:26:49 +0000 Subject: [PATCH 06/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/gemm/cublaslt_gemm.cu | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index bb29d58de4..55f52a1c4d 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1123,18 +1123,17 @@ struct TensorShapeInfo { NVTE_CHECK(has_last || t->all_same_last_dim(), "GroupedTensor is missing last_dims for varying shapes"); - const int64_t *first_ptr = has_first ? static_cast(t->first_dims.dptr) : nullptr; + const int64_t *first_ptr = + has_first ? static_cast(t->first_dims.dptr) : nullptr; const int64_t *last_ptr = has_last ? static_cast(t->last_dims.dptr) : nullptr; const int64_t uniform_first = has_first ? 0 : static_cast(t->get_common_first_dim()); const int64_t uniform_last = has_last ? 0 : static_cast(t->get_common_last_dim()); - return {first_ptr, - last_ptr, + return {first_ptr, last_ptr, t->tensor_offsets.has_data() ? static_cast(t->tensor_offsets.dptr) : nullptr, - uniform_first, - uniform_last}; + uniform_first, uniform_last}; } // Create for C tensor (uses D's dimensions, only has offsets) @@ -1153,12 +1152,10 @@ struct TensorShapeInfo { const int64_t uniform_first = has_first ? 0 : static_cast(D->get_common_first_dim()); const int64_t uniform_last = has_last ? 0 : static_cast(D->get_common_last_dim()); - return {first_ptr, - last_ptr, + return {first_ptr, last_ptr, C->tensor_offsets.has_data() ? static_cast(C->tensor_offsets.dptr) : nullptr, - uniform_first, - uniform_last}; + uniform_first, uniform_last}; } }; @@ -1360,9 +1357,9 @@ inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, cublasLtMatrixLayoutOpaque_t &descB, cublasLtMatrixLayoutOpaque_t &descC, cublasLtMatrixLayoutOpaque_t &descD, - const GroupedGemmSetupWorkspace &ws, - bool transa, bool transb, bool a_columnwise, bool b_columnwise, - size_t num_tensors, cudaDataType_t A_type, cudaDataType_t B_type, + const GroupedGemmSetupWorkspace &ws, bool transa, bool transb, + bool a_columnwise, bool b_columnwise, size_t num_tensors, + cudaDataType_t A_type, cudaDataType_t B_type, cudaDataType_t D_type) { // For column-major layout: leading dimension is the number of rows in storage. // If columnwise data was chosen, storage is already transposed. @@ -1611,15 +1608,15 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT // For FP8 grouped GEMM, we need to pass scale_inv pointers // The scale_inv arrays contain one float per tensor in the group if (is_fp8_a) { - void *a_scale_inv = A_sel.use_columnwise ? inputA->columnwise_scale_inv.dptr - : inputA->scale_inv.dptr; + void *a_scale_inv = + A_sel.use_columnwise ? inputA->columnwise_scale_inv.dptr : inputA->scale_inv.dptr; NVTE_CHECK(a_scale_inv != nullptr, "FP8 grouped GEMM: A scale_inv is required"); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( &matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_inv, sizeof(a_scale_inv))); } if (is_fp8_b) { - void *b_scale_inv = B_sel.use_columnwise ? inputB->columnwise_scale_inv.dptr - : inputB->scale_inv.dptr; + void *b_scale_inv = + B_sel.use_columnwise ? inputB->columnwise_scale_inv.dptr : inputB->scale_inv.dptr; NVTE_CHECK(b_scale_inv != nullptr, "FP8 grouped GEMM: B scale_inv is required"); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( &matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_inv, sizeof(b_scale_inv))); From 1329b3746abfe3f9d845e90da7945bede6e3893c Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Wed, 10 Dec 2025 22:34:16 +0100 Subject: [PATCH 07/98] fix Signed-off-by: Pawel Gadzinski --- .../common/gemm/cublaslt_gemm.cu | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 55f52a1c4d..3662247b51 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1217,9 +1217,6 @@ struct GroupedGemmSetupWorkspace { ws.N = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; ws.K = reinterpret_cast(setup_ws_ptr + offset); - offset += int_size; - - offset = ((offset + alignment - 1) / alignment) * alignment; return ws; } @@ -1363,21 +1360,21 @@ inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, cudaDataType_t D_type) { // For column-major layout: leading dimension is the number of rows in storage. // If columnwise data was chosen, storage is already transposed. - const int *rowa = a_columnwise ? ws.M : (transa ? ws.K : ws.M); - const int *cola = a_columnwise ? ws.K : (transa ? ws.M : ws.K); - const int *lda = rowa; - const int *rowb = b_columnwise ? ws.N : (transb ? ws.N : ws.K); - const int *colb = b_columnwise ? ws.K : (transb ? ws.K : ws.N); - const int *ldb = rowb; - - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, (void *)rowa, - (void *)cola, (void *)lda)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, (void *)rowb, - (void *)colb, (void *)ldb)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, (void *)ws.M, - (void *)ws.N, (void *)ws.M)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, (void *)ws.M, - (void *)ws.N, (void *)ws.M)); + int *rowa = a_columnwise ? ws.M : (transa ? ws.K : ws.M); + int *cola = a_columnwise ? ws.K : (transa ? ws.M : ws.K); + int *lda = rowa; + int *rowb = b_columnwise ? ws.N : (transb ? ws.N : ws.K); + int *colb = b_columnwise ? ws.K : (transb ? ws.K : ws.N); + int *ldb = rowb; + + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, + rowa, cola, lda)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, + rowb, colb, ldb)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, + ws.M, ws.N, ws.M)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, + ws.M, ws.N, ws.M)); } inline void init_matmul_desc(cublasLtMatmulDescOpaque_t &matmulDesc, cublasOperation_t op_A, From 47c58be8ce0ee14fc26a90a2f8b3ad8035283b4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 Dec 2025 21:35:06 +0000 Subject: [PATCH 08/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/gemm/cublaslt_gemm.cu | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 3662247b51..91405bd42f 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1367,14 +1367,10 @@ inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, int *colb = b_columnwise ? ws.K : (transb ? ws.K : ws.N); int *ldb = rowb; - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, - rowa, cola, lda)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, - rowb, colb, ldb)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, - ws.M, ws.N, ws.M)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, - ws.M, ws.N, ws.M)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rowa, cola, lda)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rowb, colb, ldb)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, ws.M, ws.N, ws.M)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, ws.M, ws.N, ws.M)); } inline void init_matmul_desc(cublasLtMatmulDescOpaque_t &matmulDesc, cublasOperation_t op_A, From a155a8a3dd17663c82882f64b30a5a118ba3695b Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Thu, 11 Dec 2025 11:55:44 +0100 Subject: [PATCH 09/98] Grouped GEMM: code cleanup and NULL C support - Remove unused alignment parameter from GroupedGemmSetupWorkspace::from_buffers - Simplify select_grouped_operand by removing dead code branches - Add GroupedOperandSelection.tensor field to avoid passing tensor separately - Extract set_fp8_scale_pointers and init_matrix_layouts helpers - Add safety check for FP8 on Hopper column-wise fallback - Support NULL C tensor when beta=0 (uses D as placeholder) - Remove unused get_scale_inv() from test - Add use_null_c test parameter and test case - Fix documentation: alpha/beta are single element tensors only Signed-off-by: Piotr Gadzinski Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 210 ++++++++---------- .../common/gemm/cublaslt_gemm.cu | 163 +++++++------- .../common/include/transformer_engine/gemm.h | 34 +-- 3 files changed, 203 insertions(+), 204 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index bff175f405..5e5144fa4c 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,6 @@ using namespace test; namespace { enum class InputCase { - kFP8Delayed, kFP8Current, kBF16, }; @@ -40,17 +40,37 @@ enum class ShapeCase { kAllDifferent, }; +// Custom deleters for RAII +struct CudaDeleter { + void operator()(void* p) const { if (p) cudaFree(p); } +}; +struct GroupedTensorDeleter { + void operator()(NVTEGroupedTensor h) const { if (h) nvte_destroy_grouped_tensor(h); } +}; + +template +using CudaPtr = std::unique_ptr; +using GroupedTensorHandle = std::unique_ptr, GroupedTensorDeleter>; + +// Helper to allocate CUDA memory into a CudaPtr +template +CudaPtr cuda_alloc(size_t bytes) { + void* ptr = nullptr; + NVTE_CHECK_CUDA(cudaMalloc(&ptr, bytes)); + return CudaPtr(static_cast(ptr)); +} + // Helper owning GPU buffers that back NVTEGroupedTensor. // NVTEGroupedTensor does not own memory; data/offsets/scales // must be allocated and freed by the test. struct GroupedBuffers { - NVTEGroupedTensor handle{nullptr}; - void* data{nullptr}; - void* scale_inv{nullptr}; - int64_t* first_dims_dev{nullptr}; - int64_t* last_dims_dev{nullptr}; - int64_t* offsets_dev{nullptr}; - void* columnwise_data{nullptr}; + GroupedTensorHandle handle; + CudaPtr<> data; + CudaPtr<> scale_inv; + CudaPtr first_dims_dev; + CudaPtr last_dims_dev; + CudaPtr offsets_dev; + CudaPtr<> columnwise_data; NVTEShape logical_shape{}; std::vector offsets_host; std::vector tensor_bytes; @@ -62,65 +82,13 @@ struct GroupedBuffers { GroupedBuffers() = default; GroupedBuffers(const GroupedBuffers&) = delete; GroupedBuffers& operator=(const GroupedBuffers&) = delete; - GroupedBuffers(GroupedBuffers&& other) noexcept { - *this = std::move(other); - } - GroupedBuffers& operator=(GroupedBuffers&& other) noexcept { - if (this == &other) return *this; - handle = other.handle; - data = other.data; - scale_inv = other.scale_inv; - first_dims_dev = other.first_dims_dev; - last_dims_dev = other.last_dims_dev; - offsets_dev = other.offsets_dev; - logical_shape = other.logical_shape; - offsets_host = std::move(other.offsets_host); - tensor_bytes = std::move(other.tensor_bytes); - num_tensors = other.num_tensors; - elem_size = other.elem_size; - dtype = other.dtype; - scaling_mode = other.scaling_mode; - - other.handle = nullptr; - other.data = nullptr; - other.scale_inv = nullptr; - other.first_dims_dev = nullptr; - other.last_dims_dev = nullptr; - other.offsets_dev = nullptr; - other.num_tensors = 0; - return *this; - } + GroupedBuffers(GroupedBuffers&&) = default; + GroupedBuffers& operator=(GroupedBuffers&&) = default; + ~GroupedBuffers() = default; - ~GroupedBuffers() { - if (data) { - cudaFree(data); - data = nullptr; - } - if (scale_inv) { - cudaFree(scale_inv); - scale_inv = nullptr; - } - if (columnwise_data) { - cudaFree(columnwise_data); - columnwise_data = nullptr; - } - if (first_dims_dev) { - cudaFree(first_dims_dev); - first_dims_dev = nullptr; - } - if (last_dims_dev) { - cudaFree(last_dims_dev); - last_dims_dev = nullptr; - } - if (offsets_dev) { - cudaFree(offsets_dev); - offsets_dev = nullptr; - } - if (handle) { - nvte_destroy_grouped_tensor(handle); - handle = nullptr; - } - } + // Convenience accessors for raw pointers + NVTEGroupedTensor get_handle() const { return handle.get(); } + void* get_data() const { return data.get(); } }; size_t grouped_setup_workspace_size(const size_t num_tensors) { @@ -211,7 +179,7 @@ GroupedBuffers build_grouped_tensor(const std::vector& tensors, size_t logical_data[2] = {static_cast(logical_first), static_cast(logical_last)}; grouped.logical_shape = nvte_make_shape(logical_data, 2); - grouped.handle = nvte_create_grouped_tensor(scaling_mode, num_tensors, grouped.logical_shape); + grouped.handle.reset(nvte_create_grouped_tensor(scaling_mode, num_tensors, grouped.logical_shape)); const int64_t last_idx = static_cast(num_tensors - 1); const int64_t total_elems = need_offsets @@ -219,59 +187,60 @@ GroupedBuffers build_grouped_tensor(const std::vector& tensors, : (logical_first * logical_last); const size_t total_bytes = static_cast(total_elems) * elem_size; - NVTE_CHECK_CUDA(cudaMalloc(&grouped.data, total_bytes)); + grouped.data = cuda_alloc(total_bytes); for (size_t i = 0; i < num_tensors; ++i) { const size_t offset_bytes = static_cast(offsets[i]) * elem_size; - NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.data) + offset_bytes, + NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.data.get()) + offset_bytes, tensors[i]->rowwise_dptr(), grouped.tensor_bytes[i], cudaMemcpyDeviceToDevice)); } - NVTEBasicTensor data_tensor{grouped.data, static_cast(dtype), grouped.logical_shape}; - nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedRowwiseData, &data_tensor); + NVTEBasicTensor data_tensor{grouped.data.get(), static_cast(dtype), grouped.logical_shape}; + NVTEGroupedTensor h = grouped.handle.get(); + nvte_set_grouped_tensor_param(&h, kNVTEGroupedRowwiseData, &data_tensor); const bool include_columnwise = isFp8Type(dtype) || isFp4Type(dtype); if (include_columnwise) { - NVTE_CHECK_CUDA(cudaMalloc(&grouped.columnwise_data, total_bytes)); + grouped.columnwise_data = cuda_alloc(total_bytes); for (size_t i = 0; i < num_tensors; ++i) { const size_t offset_bytes = static_cast(offsets[i]) * elem_size; - NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.columnwise_data) + offset_bytes, + NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.columnwise_data.get()) + offset_bytes, tensors[i]->columnwise_dptr(), grouped.tensor_bytes[i], cudaMemcpyDeviceToDevice)); } - NVTEBasicTensor col_tensor{grouped.columnwise_data, + NVTEBasicTensor col_tensor{grouped.columnwise_data.get(), static_cast(dtype), grouped.logical_shape}; - nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedColumnwiseData, &col_tensor); + nvte_set_grouped_tensor_param(&h, kNVTEGroupedColumnwiseData, &col_tensor); } if (!same_first) { - NVTE_CHECK_CUDA(cudaMalloc(&grouped.first_dims_dev, num_tensors * sizeof(int64_t))); - NVTE_CHECK_CUDA(cudaMemcpy(grouped.first_dims_dev, first_dims.data(), + grouped.first_dims_dev = cuda_alloc(num_tensors * sizeof(int64_t)); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.first_dims_dev.get(), first_dims.data(), num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); NVTEShape fd_shape = nvte_make_shape(&num_tensors, 1); - NVTEBasicTensor fd_tensor{grouped.first_dims_dev, kNVTEInt64, fd_shape}; - nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedFirstDims, &fd_tensor); + NVTEBasicTensor fd_tensor{grouped.first_dims_dev.get(), kNVTEInt64, fd_shape}; + nvte_set_grouped_tensor_param(&h, kNVTEGroupedFirstDims, &fd_tensor); } if (!same_last) { - NVTE_CHECK_CUDA(cudaMalloc(&grouped.last_dims_dev, num_tensors * sizeof(int64_t))); - NVTE_CHECK_CUDA(cudaMemcpy(grouped.last_dims_dev, last_dims.data(), + grouped.last_dims_dev = cuda_alloc(num_tensors * sizeof(int64_t)); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.last_dims_dev.get(), last_dims.data(), num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); NVTEShape ld_shape = nvte_make_shape(&num_tensors, 1); - NVTEBasicTensor ld_tensor{grouped.last_dims_dev, kNVTEInt64, ld_shape}; - nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedLastDims, &ld_tensor); + NVTEBasicTensor ld_tensor{grouped.last_dims_dev.get(), kNVTEInt64, ld_shape}; + nvte_set_grouped_tensor_param(&h, kNVTEGroupedLastDims, &ld_tensor); } if (!same_first || !same_last) { - NVTE_CHECK_CUDA(cudaMalloc(&grouped.offsets_dev, num_tensors * sizeof(int64_t))); - NVTE_CHECK_CUDA(cudaMemcpy(grouped.offsets_dev, offsets.data(), + grouped.offsets_dev = cuda_alloc(num_tensors * sizeof(int64_t)); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.offsets_dev.get(), offsets.data(), num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); NVTEShape off_shape = nvte_make_shape(&num_tensors, 1); - NVTEBasicTensor off_tensor{grouped.offsets_dev, kNVTEInt64, off_shape}; - nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedTensorOffsets, &off_tensor); + NVTEBasicTensor off_tensor{grouped.offsets_dev.get(), kNVTEInt64, off_shape}; + nvte_set_grouped_tensor_param(&h, kNVTEGroupedTensorOffsets, &off_tensor); } if (isFp8Type(dtype)) { @@ -280,13 +249,13 @@ GroupedBuffers build_grouped_tensor(const std::vector& tensors, tensors[i]->to_cpu(); scale_inv_cpu[i] = tensors[i]->rowwise_cpu_scale_inv_ptr()[0]; } - NVTE_CHECK_CUDA(cudaMalloc(&grouped.scale_inv, sizeof(float) * num_tensors)); - NVTE_CHECK_CUDA(cudaMemcpy(grouped.scale_inv, scale_inv_cpu.data(), + grouped.scale_inv = cuda_alloc(sizeof(float) * num_tensors); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.scale_inv.get(), scale_inv_cpu.data(), sizeof(float) * num_tensors, cudaMemcpyHostToDevice)); NVTEShape scale_shape = nvte_make_shape(&num_tensors, 1); - NVTEBasicTensor scale_tensor{grouped.scale_inv, kNVTEFloat32, scale_shape}; - nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedRowwiseScaleInv, &scale_tensor); - nvte_set_grouped_tensor_param(&grouped.handle, kNVTEGroupedColumnwiseScaleInv, &scale_tensor); + NVTEBasicTensor scale_tensor{grouped.scale_inv.get(), kNVTEFloat32, scale_shape}; + nvte_set_grouped_tensor_param(&h, kNVTEGroupedRowwiseScaleInv, &scale_tensor); + nvte_set_grouped_tensor_param(&h, kNVTEGroupedColumnwiseScaleInv, &scale_tensor); } return grouped; @@ -321,6 +290,7 @@ struct TestParams { bool transa; bool transb; ShapeCase shape_case; + bool use_null_c = false; // When true, pass nullptr for C (valid when beta=0) }; // Returns a vector of (M, N, K) tuples for each GEMM in the group. @@ -332,12 +302,14 @@ std::vector> make_shapes(ShapeCase scase) { case ShapeCase::kAllSame: return {{64, 64, 32}, {64, 64, 32}, {64, 64, 32}}; case ShapeCase::kSameFirst: - return {{64, 80, 32}, {64, 80, 48}, {64, 80, 64}}; + // Same M (first dim), varying N and K + return {{64, 80, 32}, {64, 96, 48}, {64, 112, 64}}; case ShapeCase::kSameLast: - return {{64, 80, 32}, {64, 80, 48}, {64, 80, 64}}; + // Same N (last dim), varying M and K + return {{64, 80, 32}, {80, 80, 48}, {96, 80, 64}}; case ShapeCase::kAllDifferent: default: - return {{64, 96, 32}, {64, 96, 48}, {64, 96, 64}}; + return {{64, 96, 32}, {80, 112, 48}, {96, 128, 64}}; } } @@ -430,9 +402,11 @@ void run_grouped_gemm_case(const TestParams& params) { for (size_t i = 0; i < num_gemms; ++i) { const auto [M, N, K] = shapes[i]; (void)K; - C_tensors.emplace_back(Tensor("C" + std::to_string(i), - std::vector{static_cast(M), static_cast(N)}, - DType::kBFloat16)); + if (!params.use_null_c) { + C_tensors.emplace_back(Tensor("C" + std::to_string(i), + std::vector{static_cast(M), static_cast(N)}, + DType::kBFloat16)); + } D_group_tensors.emplace_back(Tensor("D_group" + std::to_string(i), std::vector{static_cast(M), static_cast(N)}, DType::kBFloat16)); @@ -441,11 +415,16 @@ void run_grouped_gemm_case(const TestParams& params) { std::vector C_views, D_views; for (size_t i = 0; i < num_gemms; ++i) { - C_views.push_back(&C_tensors[i]); + if (!params.use_null_c) { + C_views.push_back(&C_tensors[i]); + } D_views.push_back(&D_group_tensors[i]); } - GroupedBuffers grouped_C = build_grouped_tensor(C_views, NVTE_DELAYED_TENSOR_SCALING); + std::optional grouped_C; + if (!params.use_null_c) { + grouped_C = build_grouped_tensor(C_views, NVTE_DELAYED_TENSOR_SCALING); + } GroupedBuffers grouped_D = build_grouped_tensor(D_views, NVTE_DELAYED_TENSOR_SCALING); Tensor alpha_tensor("alpha", std::vector{1}, DType::kFloat32); @@ -462,11 +441,11 @@ void run_grouped_gemm_case(const TestParams& params) { nvte_grouped_gemm(params.transa, params.transb, alpha_tensor.data(), - grouped_A.handle, - grouped_B.handle, + grouped_A.get_handle(), + grouped_B.get_handle(), beta_tensor.data(), - grouped_C.handle, - grouped_D.handle, + params.use_null_c ? nullptr : grouped_C->get_handle(), + grouped_D.get_handle(), setup_ws.data(), cublas_ws.data(), nullptr, @@ -482,7 +461,7 @@ void run_grouped_gemm_case(const TestParams& params) { D_multi[i].dtype()); const size_t offset_bytes = static_cast(grouped_D.offsets_host[i]) * grouped_D.elem_size; NVTE_CHECK_CUDA(cudaMemcpy(grouped_split.rowwise_dptr(), - static_cast(grouped_D.data) + offset_bytes, + static_cast(grouped_D.get_data()) + offset_bytes, grouped_D.tensor_bytes[i], cudaMemcpyDeviceToDevice)); grouped_split.to_cpu(); @@ -504,22 +483,25 @@ TEST_P(GroupedGemmTest, CompareWithMultiTensorGemm) { } std::string MakeGroupedGemmTestName(const testing::TestParamInfo& info) { - constexpr const char* kInputNames[] = {"FP8Delayed", "FP8Current", "BF16"}; + constexpr const char* kInputNames[] = {"FP8Current", "BF16"}; constexpr const char* kShapeNames[] = {"AllSame", "SameM", "SameN", "AllDiff"}; const std::string layout = std::string("ta") + (info.param.transa ? "T" : "N") + "tb" + (info.param.transb ? "T" : "N"); + const std::string null_c = info.param.use_null_c ? "_NullC" : ""; return std::string(kInputNames[static_cast(info.param.input_case)]) + "_" + - kShapeNames[static_cast(info.param.shape_case)] + "_" + layout; + kShapeNames[static_cast(info.param.shape_case)] + "_" + layout + null_c; } const std::vector kTestParams = { - {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent}, - {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent}, - {InputCase::kFP8Current, false, false, ShapeCase::kAllSame}, - {InputCase::kBF16, true, false, ShapeCase::kSameFirst}, - {InputCase::kBF16, false, true, ShapeCase::kSameLast}, - {InputCase::kBF16, false, false, ShapeCase::kAllSame}, - {InputCase::kBF16, true, true, ShapeCase::kAllDifferent}, + {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false}, + {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false}, + {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false}, + {InputCase::kBF16, true, false, ShapeCase::kSameFirst, false}, + {InputCase::kBF16, false, true, ShapeCase::kSameLast, false}, + {InputCase::kBF16, false, false, ShapeCase::kAllSame, false}, + {InputCase::kBF16, true, true, ShapeCase::kAllDifferent, false}, + // Test NULL C (valid when beta=0) + {InputCase::kBF16, false, false, ShapeCase::kAllSame, true}, }; INSTANTIATE_TEST_SUITE_P(OperatorTest, diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 91405bd42f..9d9a5097d4 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1190,8 +1190,7 @@ struct GroupedGemmSetupWorkspace { // Initialize from workspace buffer // Layout: all pointer arrays first (8-byte aligned), then int arrays (4-byte aligned) - static GroupedGemmSetupWorkspace from_buffers(char *setup_ws_ptr, size_t num_tensors, - size_t alignment) { + static GroupedGemmSetupWorkspace from_buffers(char *setup_ws_ptr, size_t num_tensors) { GroupedGemmSetupWorkspace ws; size_t offset = 0; const size_t ptr_size = num_tensors * sizeof(void *); @@ -1243,8 +1242,11 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor NVTE_CHECK(num_tensors >= 1, "Grouped GEMM: num_tensors must be at least 1"); NVTE_CHECK(inputB->num_tensors == num_tensors, "Grouped GEMM: A and B must have the same num_tensors"); - NVTE_CHECK(inputC->num_tensors == num_tensors, - "Grouped GEMM: A and C must have the same num_tensors"); + // C can be NULL (will use D as C when beta=0) + if (inputC != nullptr) { + NVTE_CHECK(inputC->num_tensors == num_tensors, + "Grouped GEMM: A and C must have the same num_tensors"); + } NVTE_CHECK(outputD->num_tensors == num_tensors, "Grouped GEMM: A and D must have the same num_tensors"); @@ -1261,8 +1263,13 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor }; NVTE_CHECK(is_fp8_or_16bit(inputA->dtype()) && is_fp8_or_16bit(inputB->dtype()), "Grouped GEMM inputs must be FP8, BF16, or FP16."); - NVTE_CHECK(is_output_dtype(inputC->dtype()) && is_output_dtype(outputD->dtype()), - "Grouped GEMM outputs must be BF16, FP16, or FP32."); + // Only check C dtype if C is provided + if (inputC != nullptr) { + NVTE_CHECK(is_output_dtype(inputC->dtype()), + "Grouped GEMM: C must be BF16, FP16, or FP32."); + } + NVTE_CHECK(is_output_dtype(outputD->dtype()), + "Grouped GEMM: D must be BF16, FP16, or FP32."); NVTE_CHECK(inputA->has_data() || inputA->has_columnwise_data(), "Grouped GEMM: A tensor is missing both row-wise and column-wise data"); NVTE_CHECK(inputB->has_data() || inputB->has_columnwise_data(), @@ -1273,6 +1280,7 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor // Mirrors the non-grouped GEMM logic for FP8 layout handling (TN-only on Hopper) and // fallback to column-wise data when row-wise is absent. struct GroupedOperandSelection { + const transformer_engine::GroupedTensor *tensor = nullptr; const char *base = nullptr; transformer_engine::DType dtype = transformer_engine::DType::kNumTypes; bool trans = false; @@ -1296,6 +1304,7 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: const DType row_dtype = t->data.dtype; const DType col_dtype = t->columnwise_data.dtype; GroupedOperandSelection sel; + sel.tensor = t; sel.trans = trans; const DType rep_dtype = has_row ? row_dtype : col_dtype; @@ -1327,6 +1336,9 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: // If only column-wise data is available, mirror the transpose flag (pre-transposed storage). if (!has_row && has_col) { + // On Hopper FP8, this would break TN requirement - should have been handled above + NVTE_CHECK(!is_fp8 || non_tn_fp8_ok, + "Grouped GEMM: FP8 on Hopper requires row-wise data for this transpose configuration"); sel.base = static_cast(t->columnwise_data.dptr); sel.dtype = col_dtype; sel.trans = !sel.trans; @@ -1334,10 +1346,10 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: return sel; } - // Default: use row-wise data (or column-wise if row-wise absent, covered above). - sel.base = static_cast(has_row ? t->data.dptr : t->columnwise_data.dptr); - sel.dtype = has_row ? row_dtype : col_dtype; - sel.use_columnwise = !has_row && has_col; + // Default: use row-wise data (column-wise case already handled above) + sel.base = static_cast(t->data.dptr); + sel.dtype = row_dtype; + sel.use_columnwise = false; return sel; } @@ -1354,17 +1366,22 @@ inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, cublasLtMatrixLayoutOpaque_t &descB, cublasLtMatrixLayoutOpaque_t &descC, cublasLtMatrixLayoutOpaque_t &descD, - const GroupedGemmSetupWorkspace &ws, bool transa, bool transb, - bool a_columnwise, bool b_columnwise, size_t num_tensors, - cudaDataType_t A_type, cudaDataType_t B_type, - cudaDataType_t D_type) { + const GroupedGemmSetupWorkspace &ws, + const GroupedOperandSelection &A_sel, + const GroupedOperandSelection &B_sel, + const transformer_engine::GroupedTensor *D, + size_t num_tensors) { + const cudaDataType_t A_type = get_cuda_dtype(A_sel.dtype); + const cudaDataType_t B_type = get_cuda_dtype(B_sel.dtype); + const cudaDataType_t D_type = get_cuda_dtype(D->dtype()); + // For column-major layout: leading dimension is the number of rows in storage. // If columnwise data was chosen, storage is already transposed. - int *rowa = a_columnwise ? ws.M : (transa ? ws.K : ws.M); - int *cola = a_columnwise ? ws.K : (transa ? ws.M : ws.K); + int *rowa = A_sel.use_columnwise ? ws.M : (A_sel.trans ? ws.K : ws.M); + int *cola = A_sel.use_columnwise ? ws.K : (A_sel.trans ? ws.M : ws.K); int *lda = rowa; - int *rowb = b_columnwise ? ws.N : (transb ? ws.N : ws.K); - int *colb = b_columnwise ? ws.K : (transb ? ws.K : ws.N); + int *rowb = B_sel.use_columnwise ? ws.N : (B_sel.trans ? ws.N : ws.K); + int *colb = B_sel.use_columnwise ? ws.K : (B_sel.trans ? ws.K : ws.N); int *ldb = rowb; NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rowa, cola, lda)); @@ -1395,6 +1412,31 @@ inline void init_matmul_desc(cublasLtMatmulDescOpaque_t &matmulDesc, cublasOpera &alphabeta_batch_stride, sizeof(int64_t))); } +inline void set_fp8_scale_pointers(cublasLtMatmulDescOpaque_t &matmulDesc, + const GroupedOperandSelection &A_sel, + const GroupedOperandSelection &B_sel) { + const bool is_fp8_a = is_fp8_dtype(A_sel.dtype); + const bool is_fp8_b = is_fp8_dtype(B_sel.dtype); + if (!is_fp8_a && !is_fp8_b) return; + + if (is_fp8_a) { + void *a_scale_inv = A_sel.use_columnwise + ? A_sel.tensor->columnwise_scale_inv.dptr + : A_sel.tensor->scale_inv.dptr; + NVTE_CHECK(a_scale_inv != nullptr, "FP8 grouped GEMM: A scale_inv is required"); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + &matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_inv, sizeof(a_scale_inv))); + } + if (is_fp8_b) { + void *b_scale_inv = B_sel.use_columnwise + ? B_sel.tensor->columnwise_scale_inv.dptr + : B_sel.tensor->scale_inv.dptr; + NVTE_CHECK(b_scale_inv != nullptr, "FP8 grouped GEMM: B scale_inv is required"); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + &matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_inv, sizeof(b_scale_inv))); + } +} + // Constants for grouped GEMM workspace (declared early for use in heuristics) static constexpr size_t kGroupedGemmAlignment = 256; static constexpr size_t kGroupedGemmCublasWorkspaceSize = 32ull * 1024 * 1024; // 32 MiB @@ -1488,20 +1530,20 @@ __global__ void setup_grouped_gemm_kernel( // Launch the setup kernel to populate workspace arrays inline void launch_grouped_gemm_setup( - const GroupedGemmSetupWorkspace &ws, const transformer_engine::GroupedTensor *A, - const transformer_engine::GroupedTensor *B, const transformer_engine::GroupedTensor *C, + const GroupedGemmSetupWorkspace &ws, const GroupedOperandSelection &A_sel, + const GroupedOperandSelection &B_sel, const transformer_engine::GroupedTensor *C, const transformer_engine::GroupedTensor *D, const transformer_engine::Tensor *alpha_tensor, - const transformer_engine::Tensor *beta_tensor, const char *a_base, const char *b_base, - size_t a_elem_size, size_t b_elem_size, bool transa, bool transb, size_t num_tensors, - cudaStream_t stream) { - TensorShapeInfo A_meta = TensorShapeInfo::from_tensor(A); - TensorShapeInfo B_meta = TensorShapeInfo::from_tensor(B); + const transformer_engine::Tensor *beta_tensor, size_t num_tensors, cudaStream_t stream) { + TensorShapeInfo A_meta = TensorShapeInfo::from_tensor(A_sel.tensor); + TensorShapeInfo B_meta = TensorShapeInfo::from_tensor(B_sel.tensor); TensorShapeInfo C_meta = TensorShapeInfo::for_C(C, D); TensorShapeInfo D_meta = TensorShapeInfo::from_tensor(D); const char *c_base = static_cast(C->data.dptr); char *d_base = static_cast(D->data.dptr); + const size_t a_elem_size = transformer_engine::typeToSize(A_sel.dtype); + const size_t b_elem_size = transformer_engine::typeToSize(B_sel.dtype); const size_t c_elem_size = transformer_engine::typeToSize(C->dtype()); const size_t d_elem_size = transformer_engine::typeToSize(D->dtype()); @@ -1510,9 +1552,9 @@ inline void launch_grouped_gemm_setup( setup_grouped_gemm_kernel<<>>( ws.A_ptrs, ws.B_ptrs, ws.C_ptrs, ws.D_ptrs, ws.M, ws.N, ws.K, ws.alpha_ptrs, ws.beta_ptrs, - a_base, b_base, c_base, d_base, A_meta, B_meta, C_meta, D_meta, a_elem_size, b_elem_size, - c_elem_size, d_elem_size, static_cast(alpha_tensor->data.dptr), - static_cast(beta_tensor->data.dptr), transa, transb, num_tensors); + A_sel.base, B_sel.base, c_base, d_base, A_meta, B_meta, C_meta, D_meta, a_elem_size, + b_elem_size, c_elem_size, d_elem_size, static_cast(alpha_tensor->data.dptr), + static_cast(beta_tensor->data.dptr), A_sel.trans, B_sel.trans, num_tensors); NVTE_CHECK_CUDA(cudaGetLastError()); } @@ -1532,7 +1574,7 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT // Convert to internal types const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A); const GroupedTensor *inputB = convertNVTEGroupedTensorCheck(B); - const GroupedTensor *inputC = convertNVTEGroupedTensorCheck(C); + const GroupedTensor *inputC_raw = convertNVTEGroupedTensor(C); // Can be NULL GroupedTensor *outputD = convertNVTEGroupedTensorCheck(D); const Tensor *alpha_tensor = convertNVTETensorCheck(alpha); const Tensor *beta_tensor = convertNVTETensorCheck(beta); @@ -1540,19 +1582,16 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT Tensor *wspace_cublas = convertNVTETensor(workspace_cublas); // Validate inputs and num_tensors - validate_grouped_gemm_inputs(inputA, inputB, inputC, outputD); + validate_grouped_gemm_inputs(inputA, inputB, inputC_raw, outputD); + + // If C is NULL, use D as C (valid when beta=0, cuBLAS won't read C data) + const GroupedTensor *inputC = (inputC_raw != nullptr) ? inputC_raw : outputD; const size_t num_tensors = inputA->num_tensors; // Select operand storage (row-wise vs column-wise) and adjust transpose flags to // mirror the non-grouped GEMM logic for FP8 layout constraints. - bool transa_flag = static_cast(transa); - bool transb_flag = static_cast(transb); - const auto A_sel = select_grouped_operand(inputA, transa_flag, /*is_A=*/true); - const auto B_sel = select_grouped_operand(inputB, transb_flag, /*is_A=*/false); - transa_flag = A_sel.trans; - transb_flag = B_sel.trans; - const size_t a_elem_size = transformer_engine::typeToSize(A_sel.dtype); - const size_t b_elem_size = transformer_engine::typeToSize(B_sel.dtype); + const auto A_sel = select_grouped_operand(inputA, static_cast(transa), /*is_A=*/true); + const auto B_sel = select_grouped_operand(inputB, static_cast(transb), /*is_A=*/false); // Workspaces: setup (pointer arrays) and cuBLAS const size_t setup_workspace_size = grouped_gemm_setup_workspace_size(num_tensors); @@ -1563,65 +1602,35 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT void *cublas_workspace_ptr = validate_and_get_workspace_ptr(wspace_cublas, cublas_workspace_size, "Grouped GEMM cuBLAS workspace"); - NVTE_CHECK(cublas_workspace_ptr != nullptr, "Grouped GEMM: cuBLAS workspace pointer is null"); - auto setup_workspace = GroupedGemmSetupWorkspace::from_buffers( - static_cast(setup_workspace_ptr), num_tensors, kGroupedGemmAlignment); - launch_grouped_gemm_setup(setup_workspace, inputA, inputB, inputC, outputD, alpha_tensor, - beta_tensor, A_sel.base, B_sel.base, a_elem_size, b_elem_size, - transa_flag, transb_flag, num_tensors, stream); + static_cast(setup_workspace_ptr), num_tensors); + launch_grouped_gemm_setup(setup_workspace, A_sel, B_sel, inputC, outputD, + alpha_tensor, beta_tensor, num_tensors, stream); // Get cuBLAS handle using cublasHandleManager = detail::HandleManager; cublasLtHandle_t handle = cublasHandleManager::Instance().GetHandle(); - // Get data types - const cudaDataType_t A_type = get_cuda_dtype(A_sel.dtype); - const cudaDataType_t B_type = get_cuda_dtype(B_sel.dtype); - const cudaDataType_t D_type = get_cuda_dtype(outputD->dtype()); - // Setup cuBLAS operations - cublasOperation_t op_A = transa_flag ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t op_B = transb_flag ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op_A = A_sel.trans ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op_B = B_sel.trans ? CUBLAS_OP_T : CUBLAS_OP_N; // Create grouped matrix layouts cublasLtMatrixLayoutOpaque_t descA, descB, descC, descD; - init_matrix_layouts(descA, descB, descC, descD, setup_workspace, transa_flag, transb_flag, - A_sel.use_columnwise, B_sel.use_columnwise, num_tensors, A_type, B_type, - D_type); + init_matrix_layouts(descA, descB, descC, descD, setup_workspace, A_sel, B_sel, outputD, + num_tensors); // Create matmul descriptor cublasLtMatmulDescOpaque_t matmulDesc; init_matmul_desc(matmulDesc, op_A, op_B); - - // Set FP8 scale pointers if needed - const bool is_fp8_a = is_fp8_dtype(A_sel.dtype); - const bool is_fp8_b = is_fp8_dtype(B_sel.dtype); - if (is_fp8_a || is_fp8_b) { - // For FP8 grouped GEMM, we need to pass scale_inv pointers - // The scale_inv arrays contain one float per tensor in the group - if (is_fp8_a) { - void *a_scale_inv = - A_sel.use_columnwise ? inputA->columnwise_scale_inv.dptr : inputA->scale_inv.dptr; - NVTE_CHECK(a_scale_inv != nullptr, "FP8 grouped GEMM: A scale_inv is required"); - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - &matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_inv, sizeof(a_scale_inv))); - } - if (is_fp8_b) { - void *b_scale_inv = - B_sel.use_columnwise ? inputB->columnwise_scale_inv.dptr : inputB->scale_inv.dptr; - NVTE_CHECK(b_scale_inv != nullptr, "FP8 grouped GEMM: B scale_inv is required"); - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - &matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_inv, sizeof(b_scale_inv))); - } - } + set_fp8_scale_pointers(matmulDesc, A_sel, B_sel); // Compute average dimensions for heuristics // K dimension: if transa, K is A's first dim; if not, K is A's last dim int64_t avg_m_val = avg_m ? *avg_m : compute_avg_first_dim(outputD); int64_t avg_n_val = avg_n ? *avg_n : compute_avg_last_dim(outputD); int64_t avg_k_val = - avg_k ? *avg_k : (transa_flag ? compute_avg_first_dim(inputA) : compute_avg_last_dim(inputA)); + avg_k ? *avg_k : (A_sel.trans ? compute_avg_first_dim(A_sel.tensor) : compute_avg_last_dim(A_sel.tensor)); // Heuristic selection cublasLtMatmulAlgo_t algo = select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 246fb5fefd..02cf01853d 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -239,19 +239,27 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * Uses NVTEGroupedTensor to efficiently handle collections of tensors with contiguous * memory layout and shape metadata. * - * \param[in] transa Whether to transpose A matrices. - * \param[in] transb Whether to transpose B matrices. - * \param[in] alpha Scale multiplier for A @ B (NVTETensor with num_tensors elements, - * or single element for uniform alpha). - * \param[in] A Input grouped tensor A. - * \param[in] B Input grouped tensor B. - * \param[in] beta Scale multiplier for C (NVTETensor with num_tensors elements, - * or single element for uniform beta). - * \param[in] C Input grouped tensor C (can be NULL for beta=0). - * \param[out] D Output grouped tensor D. - * \param[in] workspace Workspace tensor for intermediate computations. - * \param[in] config Matrix multiplication configuration. - * \param[in] stream CUDA stream for the operation. + * \param[in] transa Whether to transpose A matrices. + * \param[in] transb Whether to transpose B matrices. + * \param[in] alpha Scale multiplier for A @ B (single element NVTETensor). + * \param[in] A Input grouped tensor A. + * \param[in] B Input grouped tensor B. + * \param[in] beta Scale multiplier for C (single element NVTETensor). + * \param[in] C Input grouped tensor C (can be NULL for beta=0). + * \param[out] D Output grouped tensor D. + * \param[in] workspace_setup Workspace tensor for pointer array setup. + * \param[in] workspace_cublas Workspace tensor for cuBLAS operations. + * \param[in] config Matrix multiplication configuration. + * \param[in] stream CUDA stream for the operation. + * \param[in] avg_m Optional hint for average M dimension across all matrices in the + * group. Used by cuBLASLt for algorithm selection heuristics. + * If NULL, computed automatically from D's logical shape. + * \param[in] avg_n Optional hint for average N dimension across all matrices in the + * group. Used by cuBLASLt for algorithm selection heuristics. + * If NULL, computed automatically from D's logical shape. + * \param[in] avg_k Optional hint for average K (reduction) dimension across all + * matrices in the group. Used by cuBLASLt for algorithm selection + * heuristics. If NULL, computed automatically from A's logical shape. * * Requirements: * - A, B, C (if provided), D must have the same num_tensors From 3b2fcdf3137cec31b83dc6dc0f64e2e367aa6f9b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Dec 2025 10:57:26 +0000 Subject: [PATCH 10/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/gemm/cublaslt_gemm.cu | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 9d9a5097d4..7f2635943b 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1265,11 +1265,9 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor "Grouped GEMM inputs must be FP8, BF16, or FP16."); // Only check C dtype if C is provided if (inputC != nullptr) { - NVTE_CHECK(is_output_dtype(inputC->dtype()), - "Grouped GEMM: C must be BF16, FP16, or FP32."); + NVTE_CHECK(is_output_dtype(inputC->dtype()), "Grouped GEMM: C must be BF16, FP16, or FP32."); } - NVTE_CHECK(is_output_dtype(outputD->dtype()), - "Grouped GEMM: D must be BF16, FP16, or FP32."); + NVTE_CHECK(is_output_dtype(outputD->dtype()), "Grouped GEMM: D must be BF16, FP16, or FP32."); NVTE_CHECK(inputA->has_data() || inputA->has_columnwise_data(), "Grouped GEMM: A tensor is missing both row-wise and column-wise data"); NVTE_CHECK(inputB->has_data() || inputB->has_columnwise_data(), @@ -1337,8 +1335,9 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: // If only column-wise data is available, mirror the transpose flag (pre-transposed storage). if (!has_row && has_col) { // On Hopper FP8, this would break TN requirement - should have been handled above - NVTE_CHECK(!is_fp8 || non_tn_fp8_ok, - "Grouped GEMM: FP8 on Hopper requires row-wise data for this transpose configuration"); + NVTE_CHECK( + !is_fp8 || non_tn_fp8_ok, + "Grouped GEMM: FP8 on Hopper requires row-wise data for this transpose configuration"); sel.base = static_cast(t->columnwise_data.dptr); sel.dtype = col_dtype; sel.trans = !sel.trans; @@ -1369,8 +1368,7 @@ inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, const GroupedGemmSetupWorkspace &ws, const GroupedOperandSelection &A_sel, const GroupedOperandSelection &B_sel, - const transformer_engine::GroupedTensor *D, - size_t num_tensors) { + const transformer_engine::GroupedTensor *D, size_t num_tensors) { const cudaDataType_t A_type = get_cuda_dtype(A_sel.dtype); const cudaDataType_t B_type = get_cuda_dtype(B_sel.dtype); const cudaDataType_t D_type = get_cuda_dtype(D->dtype()); @@ -1420,17 +1418,15 @@ inline void set_fp8_scale_pointers(cublasLtMatmulDescOpaque_t &matmulDesc, if (!is_fp8_a && !is_fp8_b) return; if (is_fp8_a) { - void *a_scale_inv = A_sel.use_columnwise - ? A_sel.tensor->columnwise_scale_inv.dptr - : A_sel.tensor->scale_inv.dptr; + void *a_scale_inv = A_sel.use_columnwise ? A_sel.tensor->columnwise_scale_inv.dptr + : A_sel.tensor->scale_inv.dptr; NVTE_CHECK(a_scale_inv != nullptr, "FP8 grouped GEMM: A scale_inv is required"); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( &matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_inv, sizeof(a_scale_inv))); } if (is_fp8_b) { - void *b_scale_inv = B_sel.use_columnwise - ? B_sel.tensor->columnwise_scale_inv.dptr - : B_sel.tensor->scale_inv.dptr; + void *b_scale_inv = B_sel.use_columnwise ? B_sel.tensor->columnwise_scale_inv.dptr + : B_sel.tensor->scale_inv.dptr; NVTE_CHECK(b_scale_inv != nullptr, "FP8 grouped GEMM: B scale_inv is required"); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( &matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_inv, sizeof(b_scale_inv))); @@ -1604,8 +1600,8 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT auto setup_workspace = GroupedGemmSetupWorkspace::from_buffers( static_cast(setup_workspace_ptr), num_tensors); - launch_grouped_gemm_setup(setup_workspace, A_sel, B_sel, inputC, outputD, - alpha_tensor, beta_tensor, num_tensors, stream); + launch_grouped_gemm_setup(setup_workspace, A_sel, B_sel, inputC, outputD, alpha_tensor, + beta_tensor, num_tensors, stream); // Get cuBLAS handle using cublasHandleManager = detail::HandleManager; @@ -1629,8 +1625,9 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT // K dimension: if transa, K is A's first dim; if not, K is A's last dim int64_t avg_m_val = avg_m ? *avg_m : compute_avg_first_dim(outputD); int64_t avg_n_val = avg_n ? *avg_n : compute_avg_last_dim(outputD); - int64_t avg_k_val = - avg_k ? *avg_k : (A_sel.trans ? compute_avg_first_dim(A_sel.tensor) : compute_avg_last_dim(A_sel.tensor)); + int64_t avg_k_val = avg_k ? *avg_k + : (A_sel.trans ? compute_avg_first_dim(A_sel.tensor) + : compute_avg_last_dim(A_sel.tensor)); // Heuristic selection cublasLtMatmulAlgo_t algo = select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, From 5b0582bbf0fd05773242df67836ec263014d52dd Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Thu, 11 Dec 2025 12:15:12 +0100 Subject: [PATCH 11/98] Grouped GEMM: per-matrix alpha/beta support - Change alpha/beta from single values to per-matrix arrays - Validate alpha/beta have exactly num_tensors elements - Update kernel to index alpha_ptr[idx] and beta_ptr[idx] - Move alpha/beta validation to validate_grouped_gemm_inputs - Update tests to use per-matrix alpha/beta arrays - Update documentation Signed-off-by: Piotr Gadzinski Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 15 +++++++----- .../common/gemm/cublaslt_gemm.cu | 24 ++++++++++++++----- .../common/include/transformer_engine/gemm.h | 4 ++-- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 5e5144fa4c..82b5bd3803 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -427,12 +427,15 @@ void run_grouped_gemm_case(const TestParams& params) { } GroupedBuffers grouped_D = build_grouped_tensor(D_views, NVTE_DELAYED_TENSOR_SCALING); - Tensor alpha_tensor("alpha", std::vector{1}, DType::kFloat32); - Tensor beta_tensor("beta", std::vector{1}, DType::kFloat32); - const float alpha_val = 1.f; - const float beta_val = 0.f; - NVTE_CHECK_CUDA(cudaMemcpy(alpha_tensor.rowwise_dptr(), &alpha_val, sizeof(float), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(beta_tensor.rowwise_dptr(), &beta_val, sizeof(float), cudaMemcpyHostToDevice)); + // Per-matrix alpha/beta (all 1.0 and 0.0 respectively) + Tensor alpha_tensor("alpha", std::vector{num_gemms}, DType::kFloat32); + Tensor beta_tensor("beta", std::vector{num_gemms}, DType::kFloat32); + std::vector alpha_vals(num_gemms, 1.f); + std::vector beta_vals(num_gemms, 0.f); + NVTE_CHECK_CUDA(cudaMemcpy(alpha_tensor.rowwise_dptr(), alpha_vals.data(), + num_gemms * sizeof(float), cudaMemcpyHostToDevice)); + NVTE_CHECK_CUDA(cudaMemcpy(beta_tensor.rowwise_dptr(), beta_vals.data(), + num_gemms * sizeof(float), cudaMemcpyHostToDevice)); const size_t setup_ws_bytes = grouped_setup_workspace_size(num_gemms); Tensor setup_ws("setup_ws", std::vector{setup_ws_bytes}, DType::kByte); diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 7f2635943b..caa394d549 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1237,7 +1237,9 @@ struct GroupedGemmSetupWorkspace { inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor *inputA, const transformer_engine::GroupedTensor *inputB, const transformer_engine::GroupedTensor *inputC, - const transformer_engine::GroupedTensor *outputD) { + const transformer_engine::GroupedTensor *outputD, + const transformer_engine::Tensor *alpha_tensor, + const transformer_engine::Tensor *beta_tensor) { const size_t num_tensors = inputA->num_tensors; NVTE_CHECK(num_tensors >= 1, "Grouped GEMM: num_tensors must be at least 1"); NVTE_CHECK(inputB->num_tensors == num_tensors, @@ -1250,6 +1252,16 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor NVTE_CHECK(outputD->num_tensors == num_tensors, "Grouped GEMM: A and D must have the same num_tensors"); + // Validate alpha/beta have per-matrix values + const size_t alpha_numel = alpha_tensor->data.shape.numel(); + const size_t beta_numel = beta_tensor->data.shape.numel(); + NVTE_CHECK(alpha_numel == num_tensors, + "Grouped GEMM: alpha must have num_tensors (", num_tensors, ") elements, got ", + alpha_numel); + NVTE_CHECK(beta_numel == num_tensors, + "Grouped GEMM: beta must have num_tensors (", num_tensors, ") elements, got ", + beta_numel); + auto is_fp8_or_16bit = [](transformer_engine::DType dtype) { return dtype == transformer_engine::DType::kFloat8E4M3 || dtype == transformer_engine::DType::kFloat8E5M2 || @@ -1481,7 +1493,7 @@ __global__ void setup_grouped_gemm_kernel( TensorShapeInfo A_meta, TensorShapeInfo B_meta, TensorShapeInfo C_meta, TensorShapeInfo D_meta, // Element sizes size_t a_elem_size, size_t b_elem_size, size_t c_elem_size, size_t d_elem_size, - // Alpha/beta pointers (same for all groups) + // Alpha/beta pointers (per-matrix arrays) float *alpha_ptr, float *beta_ptr, // Transpose flags bool transa, bool transb, @@ -1519,9 +1531,9 @@ __global__ void setup_grouped_gemm_kernel( K[idx] = static_cast(transa ? a_last : a_first); N[idx] = static_cast(transb ? b_last : b_first); - // Fill alpha/beta pointers (same for all groups) - alpha_ptrs[idx] = alpha_ptr; - beta_ptrs[idx] = beta_ptr; + // Fill alpha/beta pointers (per-matrix) + alpha_ptrs[idx] = alpha_ptr + idx; + beta_ptrs[idx] = beta_ptr + idx; } // Launch the setup kernel to populate workspace arrays @@ -1578,7 +1590,7 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT Tensor *wspace_cublas = convertNVTETensor(workspace_cublas); // Validate inputs and num_tensors - validate_grouped_gemm_inputs(inputA, inputB, inputC_raw, outputD); + validate_grouped_gemm_inputs(inputA, inputB, inputC_raw, outputD, alpha_tensor, beta_tensor); // If C is NULL, use D as C (valid when beta=0, cuBLAS won't read C data) const GroupedTensor *inputC = (inputC_raw != nullptr) ? inputC_raw : outputD; diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 02cf01853d..9dfa009115 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -241,10 +241,10 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * * \param[in] transa Whether to transpose A matrices. * \param[in] transb Whether to transpose B matrices. - * \param[in] alpha Scale multiplier for A @ B (single element NVTETensor). + * \param[in] alpha Scale multipliers for A @ B (NVTETensor with num_tensors elements). * \param[in] A Input grouped tensor A. * \param[in] B Input grouped tensor B. - * \param[in] beta Scale multiplier for C (single element NVTETensor). + * \param[in] beta Scale multipliers for C (NVTETensor with num_tensors elements). * \param[in] C Input grouped tensor C (can be NULL for beta=0). * \param[out] D Output grouped tensor D. * \param[in] workspace_setup Workspace tensor for pointer array setup. From 101766bcb15e9cd6a9df01eaa6e5b5b9d9989f40 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Dec 2025 11:17:48 +0000 Subject: [PATCH 12/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/gemm/cublaslt_gemm.cu | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index caa394d549..1d63cf65cf 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1255,12 +1255,10 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor // Validate alpha/beta have per-matrix values const size_t alpha_numel = alpha_tensor->data.shape.numel(); const size_t beta_numel = beta_tensor->data.shape.numel(); - NVTE_CHECK(alpha_numel == num_tensors, - "Grouped GEMM: alpha must have num_tensors (", num_tensors, ") elements, got ", - alpha_numel); - NVTE_CHECK(beta_numel == num_tensors, - "Grouped GEMM: beta must have num_tensors (", num_tensors, ") elements, got ", - beta_numel); + NVTE_CHECK(alpha_numel == num_tensors, "Grouped GEMM: alpha must have num_tensors (", num_tensors, + ") elements, got ", alpha_numel); + NVTE_CHECK(beta_numel == num_tensors, "Grouped GEMM: beta must have num_tensors (", num_tensors, + ") elements, got ", beta_numel); auto is_fp8_or_16bit = [](transformer_engine::DType dtype) { return dtype == transformer_engine::DType::kFloat8E4M3 || From 1167f7539fb91a7d8cb7de2ea252e89415967073 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Thu, 11 Dec 2025 12:25:28 +0100 Subject: [PATCH 13/98] Fix alpha/beta numel - use SimpleTensor::numel() Signed-off-by: Piotr Gadzinski Signed-off-by: Pawel Gadzinski --- transformer_engine/common/gemm/cublaslt_gemm.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 1d63cf65cf..b8aa2a8ba3 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -1253,12 +1253,14 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor "Grouped GEMM: A and D must have the same num_tensors"); // Validate alpha/beta have per-matrix values - const size_t alpha_numel = alpha_tensor->data.shape.numel(); - const size_t beta_numel = beta_tensor->data.shape.numel(); - NVTE_CHECK(alpha_numel == num_tensors, "Grouped GEMM: alpha must have num_tensors (", num_tensors, - ") elements, got ", alpha_numel); - NVTE_CHECK(beta_numel == num_tensors, "Grouped GEMM: beta must have num_tensors (", num_tensors, - ") elements, got ", beta_numel); + const size_t alpha_numel = alpha_tensor->data.numel(); + const size_t beta_numel = beta_tensor->data.numel(); + NVTE_CHECK(alpha_numel == num_tensors, + "Grouped GEMM: alpha must have num_tensors (", num_tensors, ") elements, got ", + alpha_numel); + NVTE_CHECK(beta_numel == num_tensors, + "Grouped GEMM: beta must have num_tensors (", num_tensors, ") elements, got ", + beta_numel); auto is_fp8_or_16bit = [](transformer_engine::DType dtype) { return dtype == transformer_engine::DType::kFloat8E4M3 || From 00eb18662846645875c9da5edaeb37b216c8833c Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 16 Dec 2025 17:41:24 -0800 Subject: [PATCH 14/98] Einsum WIP 1 --- build_tools/build_ext.py | 6 ++ transformer_engine/jax/cpp_extensions/base.py | 12 +-- transformer_engine/jax/cpp_extensions/gemm.py | 10 +-- transformer_engine/jax/dense.py | 87 +++++++++++++------ transformer_engine/jax/sharding.py | 2 + 5 files changed, 79 insertions(+), 38 deletions(-) diff --git a/build_tools/build_ext.py b/build_tools/build_ext.py index 349858ac49..c269a29874 100644 --- a/build_tools/build_ext.py +++ b/build_tools/build_ext.py @@ -61,6 +61,12 @@ def _build_cmake(self, build_dir: Path, install_dir: Path) -> None: f"-DCMAKE_BUILD_TYPE={build_type}", f"-DCMAKE_INSTALL_PREFIX={install_dir}", ] + if bool(int(os.getenv("NVTE_USE_CCACHE", "0"))): + ccache_bin = os.getenv("NVTE_CCACHE_BIN", "ccache") + configure_command += [ + f"-DCMAKE_CXX_COMPILER_LAUNCHER={ccache_bin}", + f"-DCMAKE_CUDA_COMPILER_LAUNCHER={ccache_bin}", + ] configure_command += self.cmake_flags import pybind11 diff --git a/transformer_engine/jax/cpp_extensions/base.py b/transformer_engine/jax/cpp_extensions/base.py index 22a4b7dda4..70734ad4c4 100644 --- a/transformer_engine/jax/cpp_extensions/base.py +++ b/transformer_engine/jax/cpp_extensions/base.py @@ -207,12 +207,12 @@ def batcher(batched_args, batch_dims, *, arg1, arg2, arg3): if batch_dim is None: batch_dim = bdim batch_size = arg.shape[bdim] - elif bdim != batch_dim: - raise ValueError( - "All batched arguments must have the same batch dimension. " - f"Got batch_dims={batch_dims}" - ) - assert batch_dim is not None and batch_size is not None, "Invalid batching config!" + # elif bdim != batch_dim: + # raise ValueError( + # "All batched arguments must have the same batch dimension. " + # f"Got batch_dims={batch_dims}" + # ) + # assert batch_dim is not None and batch_size is not None, "Invalid batching config!" # Loop over batch dimension and collect results all_results = [] diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 55a1700838..7d44643046 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -812,11 +812,11 @@ def batcher( lhs_bdims, _, rhs_bdims, *_ = batch_dims # Validate batch dimensions - if lhs_bdims is not None or rhs_bdims is not None: - assert lhs_bdims == rhs_bdims, ( - "Batched GEMM requires matching batch dimensions, " - f"got lhs_bdims={lhs_bdims}, rhs_bdims={rhs_bdims}" - ) + # if lhs_bdims is not None or rhs_bdims is not None: + # assert lhs_bdims == rhs_bdims, ( + # "Batched GEMM requires matching batch dimensions, " + # f"got lhs_bdims={lhs_bdims}, rhs_bdims={rhs_bdims}" + # ) # Use general batcher from BasePrimitive return GemmPrimitive.batcher_impl( diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index c499b0651e..f941e598ae 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -69,6 +69,7 @@ def dense( output_axes: Tuple[str, ...] = None, collective_op_set: tex.CollectiveOpSet = tex.noop_collective_op_set, quantizer_set: QuantizerSet = noop_quantizer_set, + batch_dims : Tuple[Sequence[int], Sequence[int]] = ((), ()), ): """Perform dense layer transformation with optional quantization. @@ -109,11 +110,12 @@ def dense( output_axes, collective_op_set, quantizer_set, + batch_dims, ) return output -@partial(jax.custom_vjp, nondiff_argnums=(3, 4, 5, 6, 7, 8)) +@partial(jax.custom_vjp, nondiff_argnums=(3, 4, 5, 6, 7, 8, 10)) def _dense( x, kernel, @@ -125,6 +127,7 @@ def _dense( output_axes, collective_op_set, quantizer_set, # need to be a diff_arg for DelayedScaling state management + batch_dims, ): """Internal implementation of dense layer transformation with custom VJP. @@ -157,6 +160,7 @@ def _dense( output_axes, collective_op_set, quantizer_set, + batch_dims, ) return output @@ -172,6 +176,7 @@ def _dense_fwd_rule( output_axes, collective_op_set, quantizer_set, + batch_dims, ): """Forward pass rule for dense layer transformation. @@ -185,9 +190,9 @@ def _dense_fwd_rule( # Check supported input layout x_is_transposed = x.ndim - 1 not in x_contracting_dims k_is_transposed = kernel.ndim - 1 in k_contracting_dims - assert ( - not x_is_transposed and not k_is_transposed - ), "Dense layer only supports `NN` layout inputs, i.e. non-transposed X and Kernel." + # assert ( + # not x_is_transposed and not k_is_transposed + # ), f"Dense layer only supports `NN` layout inputs, i.e. non-transposed X and Kernel. {x_contracting_dims=},{x.ndim=},{k_contracting_dims=},{kernel.ndim=}" flatten_axis_x = -len(x_contracting_dims) flatten_axis_k = len(k_contracting_dims) - len(kernel.shape) @@ -237,6 +242,47 @@ def _dense_fwd_rule( ) return output, ctx +def dot_general_transpose_lhs(g, x, y, *, dimension_numbers, + swap_ans=False): + # from: https://github.com/google/flax/blob/main/flax/linen/fp8_ops.py#L198 + import itertools + import numpy as np + def _remaining(original, *removed_lists): + removed = set(itertools.chain(*removed_lists)) + return [i for i in original if i not in removed] + + def _ranges_like(*xs): + start = 0 + for x in xs: + x_len = len(x) + yield range(start, start + x_len) + start += x_len + + (x_contract, y_contract), (x_batch, y_batch) = dimension_numbers + x_ndim = x.ndim + x_kept = _remaining(range(x_ndim), x_contract, x_batch) + y_kept = _remaining(range(y.ndim), y_contract, y_batch) + if swap_ans: + ans_batch, ans_y, _ = _ranges_like(x_batch, y_kept, x_kept) + else: + ans_batch, _, ans_y = _ranges_like(x_batch, x_kept, y_kept) + dims = ((ans_y, y_kept), (ans_batch, y_batch)) + x_contract_sorted_by_y = list(np.take(x_contract, np.argsort(y_contract))) + out_axes = np.argsort(list(x_batch) + x_kept + x_contract_sorted_by_y) + x_bar = jax.lax.transpose( + # TODO(jberchtold): I'm ignoring the batch_dims here, do I need to explicitly use vmap or something? + tex.gemm(g, y, contracting_dims=dims[0]), + tuple(out_axes) + ) + return x_bar + +def dot_general_transpose_rhs(g, x, y, *, dimension_numbers): + (x_contract, y_contract), (x_batch, y_batch) = dimension_numbers + swapped_dimension_numbers = ((y_contract, x_contract), (y_batch, x_batch)) + y_bar = dot_general_transpose_lhs( + g, y, x, dimension_numbers=swapped_dimension_numbers, + swap_ans=True) + return y_bar def _dense_bwd_rule( contracting_dims, @@ -245,6 +291,7 @@ def _dense_bwd_rule( kernel_axes, output_axes, collective_op_set, + batch_dims, ctx, grad, ): @@ -277,35 +324,21 @@ def _dense_bwd_rule( transpose_batch_sequence=transpose_batch_sequence, ) - # GEMM NT - # k_non_contracting_dims calibrated with the shape difference of grad.ndim vs kernel.ndim - g_contracting_dim = tuple( - range(grad.ndim - len(kernel_shape) + len(fwd_k_contracting_dims), grad.ndim) - ) - # k_non_contracting_dims - k_contracting_dim = tuple( - dim for dim in range(len(kernel_shape)) if dim not in fwd_k_contracting_dims - ) + fwd_cdims = (fwd_x_contracting_dims, fwd_k_contracting_dims) + dims = (fwd_cdims, batch_dims) - dgrad = tex.gemm( + dgrad = dot_general_transpose_lhs( casted_grad.get_tensor(usage=TensorUsage.LHS), + casted_x_lhs, casted_kernel_rhs, - contracting_dims=(g_contracting_dim, k_contracting_dim), - transpose_batch_sequence=transpose_batch_sequence, - collective_op=collective_op_set.backward, + dimension_numbers=dims, ) - # GEMM TN - # x_non_contracting_dims - g_contracting_dim = x_contracting_dim = tuple( - range(0, len(x_shape) - len(fwd_x_contracting_dims)) - ) - - wgrad = tex.gemm( + wgrad = dot_general_transpose_rhs( + casted_grad.get_tensor(usage=TensorUsage.LHS), # TODO(jberchtold): should be RHS to use fused kernel for 2x layout? but would need to update dims accordingly casted_x_lhs, - casted_grad.get_tensor(usage=TensorUsage.RHS), - contracting_dims=(x_contracting_dim, g_contracting_dim), - transpose_batch_sequence=transpose_batch_sequence, + casted_kernel_rhs, + dimension_numbers=dims, ) dgrad = with_sharding_constraint_by_logical_axes(dgrad, input_axes) diff --git a/transformer_engine/jax/sharding.py b/transformer_engine/jax/sharding.py index 6cb0dd257c..01405ba87a 100644 --- a/transformer_engine/jax/sharding.py +++ b/transformer_engine/jax/sharding.py @@ -261,6 +261,8 @@ def get_mesh_axis_size(axis, mesh=None): if axis is None: return 1 + print(mesh) + assert axis in mesh.shape, f"{axis} is not a axis of the given mesh {mesh.shape}" return mesh.shape[axis] From 38defb8ec354055f0a14017d5a525e1cc911d57c Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 18 Dec 2025 08:45:19 -0800 Subject: [PATCH 15/98] Test --- transformer_engine/jax/cpp_extensions/base.py | 2 +- transformer_engine/jax/cpp_extensions/quantization.py | 2 +- transformer_engine/jax/dense.py | 9 ++------- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/base.py b/transformer_engine/jax/cpp_extensions/base.py index 70734ad4c4..defdce7b68 100644 --- a/transformer_engine/jax/cpp_extensions/base.py +++ b/transformer_engine/jax/cpp_extensions/base.py @@ -212,7 +212,7 @@ def batcher(batched_args, batch_dims, *, arg1, arg2, arg3): # "All batched arguments must have the same batch dimension. " # f"Got batch_dims={batch_dims}" # ) - # assert batch_dim is not None and batch_size is not None, "Invalid batching config!" + assert batch_dim is not None and batch_size is not None, "Invalid batching config!" # Loop over batch dimension and collect results all_results = [] diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py index 53c6937fb4..c5d76cf28c 100644 --- a/transformer_engine/jax/cpp_extensions/quantization.py +++ b/transformer_engine/jax/cpp_extensions/quantization.py @@ -362,7 +362,7 @@ def batcher( use_rht, ): """Batch rule for quantization primitive using general batcher.""" - check_valid_batch_dims(batch_dims) + # check_valid_batch_dims(batch_dims) assert BaseDBiasQuantizePrimitive.outer_primitive is not None return BaseDBiasQuantizePrimitive.batcher_impl( diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index f941e598ae..62b0e054aa 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -69,7 +69,6 @@ def dense( output_axes: Tuple[str, ...] = None, collective_op_set: tex.CollectiveOpSet = tex.noop_collective_op_set, quantizer_set: QuantizerSet = noop_quantizer_set, - batch_dims : Tuple[Sequence[int], Sequence[int]] = ((), ()), ): """Perform dense layer transformation with optional quantization. @@ -110,12 +109,11 @@ def dense( output_axes, collective_op_set, quantizer_set, - batch_dims, ) return output -@partial(jax.custom_vjp, nondiff_argnums=(3, 4, 5, 6, 7, 8, 10)) +@partial(jax.custom_vjp, nondiff_argnums=(3, 4, 5, 6, 7, 8)) def _dense( x, kernel, @@ -127,7 +125,6 @@ def _dense( output_axes, collective_op_set, quantizer_set, # need to be a diff_arg for DelayedScaling state management - batch_dims, ): """Internal implementation of dense layer transformation with custom VJP. @@ -160,7 +157,6 @@ def _dense( output_axes, collective_op_set, quantizer_set, - batch_dims, ) return output @@ -176,7 +172,6 @@ def _dense_fwd_rule( output_axes, collective_op_set, quantizer_set, - batch_dims, ): """Forward pass rule for dense layer transformation. @@ -291,7 +286,6 @@ def _dense_bwd_rule( kernel_axes, output_axes, collective_op_set, - batch_dims, ctx, grad, ): @@ -325,6 +319,7 @@ def _dense_bwd_rule( ) fwd_cdims = (fwd_x_contracting_dims, fwd_k_contracting_dims) + batch_dims = ((), ()) # vmap is done outside dense VJP if needed dims = (fwd_cdims, batch_dims) dgrad = dot_general_transpose_lhs( From e4a80a3522b8d1b29199d807a4770ebc815ca487 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Fri, 19 Dec 2025 09:57:33 +0100 Subject: [PATCH 16/98] Refactor: move grouped GEMM to separate file and cleanup API Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 12 +- .../common/gemm/cublaslt_gemm.cu | 549 +--------------- .../common/gemm/cublaslt_grouped_gemm.cu | 599 ++++++++++++++++++ .../common/gemm/cublaslt_grouped_gemm.cuh | 18 + .../common/include/transformer_engine/gemm.h | 12 +- 5 files changed, 635 insertions(+), 555 deletions(-) create mode 100644 transformer_engine/common/gemm/cublaslt_grouped_gemm.cu create mode 100644 transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 82b5bd3803..0ea76946bc 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -4,6 +4,7 @@ * See LICENSE for license information. ************************************************************************/ +#include #include #include #include @@ -314,9 +315,12 @@ std::vector> make_shapes(ShapeCase scase) { } void run_grouped_gemm_case(const TestParams& params) { - if (params.input_case != InputCase::kBF16 && - getDeviceComputeCapability() < hopperComputeCapability) { - GTEST_SKIP() << "FP8 grouped GEMM requires Hopper or newer."; +#if CUBLAS_VERSION < 130200 + GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.2+, but compile-time cuBLAS version is " + << CUBLAS_VERSION << "."; +#else + if (getDeviceComputeCapability() < hopperComputeCapability) { + GTEST_SKIP() << "Grouped GEMM requires Hopper (SM90) or newer."; } const std::vector> shapes = make_shapes(params.shape_case); @@ -451,7 +455,6 @@ void run_grouped_gemm_case(const TestParams& params) { grouped_D.get_handle(), setup_ws.data(), cublas_ws.data(), - nullptr, 0, nullptr, nullptr, @@ -477,6 +480,7 @@ void run_grouped_gemm_case(const TestParams& params) { atol, rtol); } +#endif // CUBLAS_VERSION >= 130200 } class GroupedGemmTest : public ::testing::TestWithParam {}; diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index b8aa2a8ba3..86f517af7d 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -23,6 +23,7 @@ #include "../util/logging.h" #include "../util/multi_stream.h" #include "./config.h" +#include "./cublaslt_grouped_gemm.cuh" #include "./cutlass_grouped_gemm.cuh" namespace { @@ -1104,551 +1105,3 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor cublas_path(); } } - -// Helper struct to pass per-tensor shape/offset info (pointer or uniform value) -struct TensorShapeInfo { - const int64_t *first_dims; // nullptr if uniform - const int64_t *last_dims; // nullptr if uniform - const int64_t *offsets; // nullptr if need to compute - int64_t uniform_first; // used if first_dims == nullptr - int64_t uniform_last; // used if last_dims == nullptr - - // Create from GroupedTensor - static TensorShapeInfo from_tensor(const transformer_engine::GroupedTensor *t) { - const bool has_first = t->first_dims.has_data(); - const bool has_last = t->last_dims.has_data(); - // When per-tensor dims are not provided, we must be in the uniform-shape case. - NVTE_CHECK(has_first || t->all_same_first_dim(), - "GroupedTensor is missing first_dims for varying shapes"); - NVTE_CHECK(has_last || t->all_same_last_dim(), - "GroupedTensor is missing last_dims for varying shapes"); - - const int64_t *first_ptr = - has_first ? static_cast(t->first_dims.dptr) : nullptr; - const int64_t *last_ptr = has_last ? static_cast(t->last_dims.dptr) : nullptr; - - const int64_t uniform_first = has_first ? 0 : static_cast(t->get_common_first_dim()); - const int64_t uniform_last = has_last ? 0 : static_cast(t->get_common_last_dim()); - - return {first_ptr, last_ptr, - t->tensor_offsets.has_data() ? static_cast(t->tensor_offsets.dptr) - : nullptr, - uniform_first, uniform_last}; - } - - // Create for C tensor (uses D's dimensions, only has offsets) - static TensorShapeInfo for_C(const transformer_engine::GroupedTensor *C, - const transformer_engine::GroupedTensor *D) { - const bool has_first = D->first_dims.has_data(); - const bool has_last = D->last_dims.has_data(); - NVTE_CHECK(has_first || D->all_same_first_dim(), - "GroupedTensor D is missing first_dims for varying shapes"); - NVTE_CHECK(has_last || D->all_same_last_dim(), - "GroupedTensor D is missing last_dims for varying shapes"); - - const int64_t *first_ptr = - has_first ? static_cast(D->first_dims.dptr) : nullptr; - const int64_t *last_ptr = has_last ? static_cast(D->last_dims.dptr) : nullptr; - const int64_t uniform_first = has_first ? 0 : static_cast(D->get_common_first_dim()); - const int64_t uniform_last = has_last ? 0 : static_cast(D->get_common_last_dim()); - - return {first_ptr, last_ptr, - C->tensor_offsets.has_data() ? static_cast(C->tensor_offsets.dptr) - : nullptr, - uniform_first, uniform_last}; - } -}; - -// Helper functions to compute average dimensions from logical_shape for heuristics -// These are hints for cuBLASLt algorithm selection, don't need to be exact -inline int64_t compute_avg_first_dim(const transformer_engine::GroupedTensor *t) { - // logical_shape[0] is either num_tensors*M (uniform) or sum_of_M (varying first) - // In both cases, dividing by num_tensors gives the average - return static_cast(t->logical_shape.data[0]) / static_cast(t->num_tensors); -} - -inline int64_t compute_avg_last_dim(const transformer_engine::GroupedTensor *t) { - if (t->all_same_last_dim()) { - // logical_shape[1] is the common N - return static_cast(t->logical_shape.data[1]); - } - // When varying, logical_shape[1] should be sum of last dims if provided; otherwise fallback to avg via division. - return static_cast(t->logical_shape.data[1]) / static_cast(t->num_tensors); -} - -// Workspace layout for grouped GEMM -struct GroupedGemmSetupWorkspace { - void **A_ptrs; - void **B_ptrs; - void **C_ptrs; - void **D_ptrs; - int *M; - int *N; - int *K; - float **alpha_ptrs; - float **beta_ptrs; - - // Initialize from workspace buffer - // Layout: all pointer arrays first (8-byte aligned), then int arrays (4-byte aligned) - static GroupedGemmSetupWorkspace from_buffers(char *setup_ws_ptr, size_t num_tensors) { - GroupedGemmSetupWorkspace ws; - size_t offset = 0; - const size_t ptr_size = num_tensors * sizeof(void *); - const size_t int_size = num_tensors * sizeof(int); - - // Pointer arrays first (all 8-byte aligned) - ws.A_ptrs = reinterpret_cast(setup_ws_ptr + offset); - offset += ptr_size; - ws.B_ptrs = reinterpret_cast(setup_ws_ptr + offset); - offset += ptr_size; - ws.C_ptrs = reinterpret_cast(setup_ws_ptr + offset); - offset += ptr_size; - ws.D_ptrs = reinterpret_cast(setup_ws_ptr + offset); - offset += ptr_size; - ws.alpha_ptrs = reinterpret_cast(setup_ws_ptr + offset); - offset += ptr_size; - ws.beta_ptrs = reinterpret_cast(setup_ws_ptr + offset); - offset += ptr_size; - - // Int arrays last (4-byte aligned, always satisfied after pointer arrays) - ws.M = reinterpret_cast(setup_ws_ptr + offset); - offset += int_size; - ws.N = reinterpret_cast(setup_ws_ptr + offset); - offset += int_size; - ws.K = reinterpret_cast(setup_ws_ptr + offset); - - return ws; - } - - // Calculate required size for setup workspace (pointer arrays + M/N/K) - static size_t required_setup_size(size_t num_tensors, size_t alignment) { - const size_t ptr_size = num_tensors * sizeof(void *); - const size_t int_size = num_tensors * sizeof(int); - // Layout: 6 ptr arrays, then 3 int arrays (no padding needed) - size_t size = 6 * ptr_size + 3 * int_size; - size = ((size + alignment - 1) / alignment) * alignment; - return size; - } -}; - -// ----------------------------------------------------------------------------- -// Helper routines to keep nvte_grouped_gemm readable -// ----------------------------------------------------------------------------- -inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor *inputA, - const transformer_engine::GroupedTensor *inputB, - const transformer_engine::GroupedTensor *inputC, - const transformer_engine::GroupedTensor *outputD, - const transformer_engine::Tensor *alpha_tensor, - const transformer_engine::Tensor *beta_tensor) { - const size_t num_tensors = inputA->num_tensors; - NVTE_CHECK(num_tensors >= 1, "Grouped GEMM: num_tensors must be at least 1"); - NVTE_CHECK(inputB->num_tensors == num_tensors, - "Grouped GEMM: A and B must have the same num_tensors"); - // C can be NULL (will use D as C when beta=0) - if (inputC != nullptr) { - NVTE_CHECK(inputC->num_tensors == num_tensors, - "Grouped GEMM: A and C must have the same num_tensors"); - } - NVTE_CHECK(outputD->num_tensors == num_tensors, - "Grouped GEMM: A and D must have the same num_tensors"); - - // Validate alpha/beta have per-matrix values - const size_t alpha_numel = alpha_tensor->data.numel(); - const size_t beta_numel = beta_tensor->data.numel(); - NVTE_CHECK(alpha_numel == num_tensors, - "Grouped GEMM: alpha must have num_tensors (", num_tensors, ") elements, got ", - alpha_numel); - NVTE_CHECK(beta_numel == num_tensors, - "Grouped GEMM: beta must have num_tensors (", num_tensors, ") elements, got ", - beta_numel); - - auto is_fp8_or_16bit = [](transformer_engine::DType dtype) { - return dtype == transformer_engine::DType::kFloat8E4M3 || - dtype == transformer_engine::DType::kFloat8E5M2 || - dtype == transformer_engine::DType::kBFloat16 || - dtype == transformer_engine::DType::kFloat16; - }; - auto is_output_dtype = [](transformer_engine::DType dtype) { - return dtype == transformer_engine::DType::kBFloat16 || - dtype == transformer_engine::DType::kFloat16 || - dtype == transformer_engine::DType::kFloat32; - }; - NVTE_CHECK(is_fp8_or_16bit(inputA->dtype()) && is_fp8_or_16bit(inputB->dtype()), - "Grouped GEMM inputs must be FP8, BF16, or FP16."); - // Only check C dtype if C is provided - if (inputC != nullptr) { - NVTE_CHECK(is_output_dtype(inputC->dtype()), "Grouped GEMM: C must be BF16, FP16, or FP32."); - } - NVTE_CHECK(is_output_dtype(outputD->dtype()), "Grouped GEMM: D must be BF16, FP16, or FP32."); - NVTE_CHECK(inputA->has_data() || inputA->has_columnwise_data(), - "Grouped GEMM: A tensor is missing both row-wise and column-wise data"); - NVTE_CHECK(inputB->has_data() || inputB->has_columnwise_data(), - "Grouped GEMM: B tensor is missing both row-wise and column-wise data"); -} - -// Select row-wise vs column-wise storage and adjust transpose flag for grouped GEMM. -// Mirrors the non-grouped GEMM logic for FP8 layout handling (TN-only on Hopper) and -// fallback to column-wise data when row-wise is absent. -struct GroupedOperandSelection { - const transformer_engine::GroupedTensor *tensor = nullptr; - const char *base = nullptr; - transformer_engine::DType dtype = transformer_engine::DType::kNumTypes; - bool trans = false; - bool use_columnwise = false; -}; - -inline GroupedOperandSelection select_grouped_operand(const transformer_engine::GroupedTensor *t, - bool trans, bool is_A) { - using namespace transformer_engine; - const bool has_row = t->has_data(); - const bool has_col = t->has_columnwise_data(); - NVTE_CHECK(has_row || has_col, - "Grouped GEMM operand is missing both row-wise and column-wise data"); - - // Not yet supported in grouped GEMM: block scaling, MXFP8, NVFP4 specialized layouts. - const auto sm = t->scaling_mode; - NVTE_CHECK(sm != NVTE_BLOCK_SCALING_1D && sm != NVTE_BLOCK_SCALING_2D && !is_mxfp_scaling(sm) && - !is_nvfp_scaling(sm), - "Grouped GEMM does not yet support NVFP4/MXFP8/block scaling operand selection"); - - const DType row_dtype = t->data.dtype; - const DType col_dtype = t->columnwise_data.dtype; - GroupedOperandSelection sel; - sel.tensor = t; - sel.trans = trans; - - const DType rep_dtype = has_row ? row_dtype : col_dtype; - const bool is_fp8 = is_fp8_dtype(rep_dtype); - const bool non_tn_fp8_ok = nvte_is_non_tn_fp8_gemm_supported(); - - // Hopper-style TN-only FP8: force TN by switching layout and flipping transpose when needed. - if (is_fp8 && !non_tn_fp8_ok) { - if (is_A) { - if (!sel.trans) { - NVTE_CHECK(has_col, "Grouped GEMM: A is missing column-wise data needed for FP8 TN layout"); - sel.base = static_cast(t->columnwise_data.dptr); - sel.dtype = col_dtype; - sel.trans = true; // using pre-transposed storage - sel.use_columnwise = true; - return sel; - } - } else { // B - if (sel.trans) { - NVTE_CHECK(has_col, "Grouped GEMM: B is missing column-wise data needed for FP8 TN layout"); - sel.base = static_cast(t->columnwise_data.dptr); - sel.dtype = col_dtype; - sel.trans = false; // using pre-transposed storage - sel.use_columnwise = true; - return sel; - } - } - } - - // If only column-wise data is available, mirror the transpose flag (pre-transposed storage). - if (!has_row && has_col) { - // On Hopper FP8, this would break TN requirement - should have been handled above - NVTE_CHECK( - !is_fp8 || non_tn_fp8_ok, - "Grouped GEMM: FP8 on Hopper requires row-wise data for this transpose configuration"); - sel.base = static_cast(t->columnwise_data.dptr); - sel.dtype = col_dtype; - sel.trans = !sel.trans; - sel.use_columnwise = true; - return sel; - } - - // Default: use row-wise data (column-wise case already handled above) - sel.base = static_cast(t->data.dptr); - sel.dtype = row_dtype; - sel.use_columnwise = false; - return sel; -} - -inline void *validate_and_get_workspace_ptr(transformer_engine::Tensor *ws, size_t required_size, - const char *workspace_name) { - NVTE_CHECK(ws != nullptr, workspace_name, " tensor is null."); - const size_t provided_size = get_buffer_size_bytes(ws->data.numel(), ws->data.dtype); - NVTE_CHECK(provided_size >= required_size, "Grouped GEMM: Insufficient ", workspace_name, - ". Required: ", required_size, " bytes, Available: ", provided_size, " bytes."); - return ws->data.dptr; -} - -inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, - cublasLtMatrixLayoutOpaque_t &descB, - cublasLtMatrixLayoutOpaque_t &descC, - cublasLtMatrixLayoutOpaque_t &descD, - const GroupedGemmSetupWorkspace &ws, - const GroupedOperandSelection &A_sel, - const GroupedOperandSelection &B_sel, - const transformer_engine::GroupedTensor *D, size_t num_tensors) { - const cudaDataType_t A_type = get_cuda_dtype(A_sel.dtype); - const cudaDataType_t B_type = get_cuda_dtype(B_sel.dtype); - const cudaDataType_t D_type = get_cuda_dtype(D->dtype()); - - // For column-major layout: leading dimension is the number of rows in storage. - // If columnwise data was chosen, storage is already transposed. - int *rowa = A_sel.use_columnwise ? ws.M : (A_sel.trans ? ws.K : ws.M); - int *cola = A_sel.use_columnwise ? ws.K : (A_sel.trans ? ws.M : ws.K); - int *lda = rowa; - int *rowb = B_sel.use_columnwise ? ws.N : (B_sel.trans ? ws.N : ws.K); - int *colb = B_sel.use_columnwise ? ws.K : (B_sel.trans ? ws.K : ws.N); - int *ldb = rowb; - - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rowa, cola, lda)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rowb, colb, ldb)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, ws.M, ws.N, ws.M)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, ws.M, ws.N, ws.M)); -} - -inline void init_matmul_desc(cublasLtMatmulDescOpaque_t &matmulDesc, cublasOperation_t op_A, - cublasOperation_t op_B) { - NVTE_CHECK_CUBLAS(cublasLtMatmulDescInit(&matmulDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &op_A, - sizeof(op_A))); - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &op_B, - sizeof(op_B))); - - cublasLtPointerMode_t pointer_mode = CUBLASLT_POINTER_MODE_DEVICE; - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, - &pointer_mode, sizeof(pointer_mode))); - - int64_t alphabeta_batch_stride = 1; - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, - CUBLASLT_MATMUL_DESC_ALPHA_BATCH_STRIDE, - &alphabeta_batch_stride, sizeof(int64_t))); - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, - CUBLASLT_MATMUL_DESC_BETA_BATCH_STRIDE, - &alphabeta_batch_stride, sizeof(int64_t))); -} - -inline void set_fp8_scale_pointers(cublasLtMatmulDescOpaque_t &matmulDesc, - const GroupedOperandSelection &A_sel, - const GroupedOperandSelection &B_sel) { - const bool is_fp8_a = is_fp8_dtype(A_sel.dtype); - const bool is_fp8_b = is_fp8_dtype(B_sel.dtype); - if (!is_fp8_a && !is_fp8_b) return; - - if (is_fp8_a) { - void *a_scale_inv = A_sel.use_columnwise ? A_sel.tensor->columnwise_scale_inv.dptr - : A_sel.tensor->scale_inv.dptr; - NVTE_CHECK(a_scale_inv != nullptr, "FP8 grouped GEMM: A scale_inv is required"); - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - &matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_inv, sizeof(a_scale_inv))); - } - if (is_fp8_b) { - void *b_scale_inv = B_sel.use_columnwise ? B_sel.tensor->columnwise_scale_inv.dptr - : B_sel.tensor->scale_inv.dptr; - NVTE_CHECK(b_scale_inv != nullptr, "FP8 grouped GEMM: B scale_inv is required"); - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - &matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_inv, sizeof(b_scale_inv))); - } -} - -// Constants for grouped GEMM workspace (declared early for use in heuristics) -static constexpr size_t kGroupedGemmAlignment = 256; -static constexpr size_t kGroupedGemmCublasWorkspaceSize = 32ull * 1024 * 1024; // 32 MiB - -inline cublasLtMatmulAlgo_t select_grouped_gemm_algo(cublasLtHandle_t handle, - cublasLtMatmulDescOpaque_t &matmulDesc, - cublasLtMatrixLayoutOpaque_t &descA, - cublasLtMatrixLayoutOpaque_t &descB, - cublasLtMatrixLayoutOpaque_t &descC, - cublasLtMatrixLayoutOpaque_t &descD, - int64_t avg_m, int64_t avg_n, int64_t avg_k) { - cublasLtMatmulPreferenceOpaque_t preference; - NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceInit(&preference)); - NVTE_CHECK_CUBLAS( - cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &kGroupedGemmCublasWorkspaceSize, sizeof(size_t))); - NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( - &preference, CUBLASLT_MATMUL_PREF_GROUPED_DESC_D_AVERAGE_ROWS, &avg_m, sizeof(int64_t))); - NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( - &preference, CUBLASLT_MATMUL_PREF_GROUPED_DESC_D_AVERAGE_COLS, &avg_n, sizeof(int64_t))); - NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( - &preference, CUBLASLT_MATMUL_PREF_GROUPED_AVERAGE_REDUCTION_DIM, &avg_k, sizeof(int64_t))); - - cublasLtMatmulHeuristicResult_t heuristicResult; - int returnedResults = 0; - auto status = cublasLtMatmulAlgoGetHeuristic(handle, &matmulDesc, &descA, &descB, &descC, &descD, - &preference, 1, &heuristicResult, &returnedResults); - NVTE_CHECK(status != CUBLAS_STATUS_NOT_SUPPORTED, - "Unable to find suitable cuBLAS grouped GEMM algorithm"); - NVTE_CHECK_CUBLAS(status); - NVTE_CHECK(returnedResults > 0, "No suitable algorithm found for grouped GEMM"); - return heuristicResult.algo; -} - -// Single kernel that sets up all GEMM parameters. -// Rationale: cuBLASLt grouped matmul API needs flat arrays of pointers and per-matrix M/N/K, -// but NVTEGroupedTensor stores a single contiguous buffer + optional per-tensor offsets/shapes. -// We bridge the mismatch on GPU by computing per-group pointers and dims in one kernel. -__global__ void setup_grouped_gemm_kernel( - // Output arrays - void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, int *M, int *N, int *K, - float **alpha_ptrs, float **beta_ptrs, - // Base pointers - const char *a_base, const char *b_base, const char *c_base, char *d_base, - // Dimension info (per tensor) - TensorShapeInfo A_meta, TensorShapeInfo B_meta, TensorShapeInfo C_meta, TensorShapeInfo D_meta, - // Element sizes - size_t a_elem_size, size_t b_elem_size, size_t c_elem_size, size_t d_elem_size, - // Alpha/beta pointers (per-matrix arrays) - float *alpha_ptr, float *beta_ptr, - // Transpose flags - bool transa, bool transb, - // Number of tensors - size_t num_tensors) { - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= num_tensors) return; - - // Get dimensions for this tensor (from array or uniform value) - int64_t a_first = A_meta.first_dims ? A_meta.first_dims[idx] : A_meta.uniform_first; - int64_t a_last = A_meta.last_dims ? A_meta.last_dims[idx] : A_meta.uniform_last; - int64_t b_first = B_meta.first_dims ? B_meta.first_dims[idx] : B_meta.uniform_first; - int64_t b_last = B_meta.last_dims ? B_meta.last_dims[idx] : B_meta.uniform_last; - - // Compute offsets (from array or compute from uniform dims) - int64_t a_offset = - A_meta.offsets ? A_meta.offsets[idx] : (idx * A_meta.uniform_first * A_meta.uniform_last); - int64_t b_offset = - B_meta.offsets ? B_meta.offsets[idx] : (idx * B_meta.uniform_first * B_meta.uniform_last); - int64_t c_offset = - C_meta.offsets ? C_meta.offsets[idx] : (idx * C_meta.uniform_first * C_meta.uniform_last); - int64_t d_offset = - D_meta.offsets ? D_meta.offsets[idx] : (idx * D_meta.uniform_first * D_meta.uniform_last); - - // Compute data pointers - A_ptrs[idx] = const_cast(a_base) + a_offset * a_elem_size; - B_ptrs[idx] = const_cast(b_base) + b_offset * b_elem_size; - C_ptrs[idx] = const_cast(c_base) + c_offset * c_elem_size; - D_ptrs[idx] = d_base + d_offset * d_elem_size; - - // Compute M, N, K dimensions - // Test stores A as {K,M} when !transa, {M,K} when transa - // Test stores B as {N,K} when !transb, {K,N} when transb - M[idx] = static_cast(transa ? a_first : a_last); - K[idx] = static_cast(transa ? a_last : a_first); - N[idx] = static_cast(transb ? b_last : b_first); - - // Fill alpha/beta pointers (per-matrix) - alpha_ptrs[idx] = alpha_ptr + idx; - beta_ptrs[idx] = beta_ptr + idx; -} - -// Launch the setup kernel to populate workspace arrays -inline void launch_grouped_gemm_setup( - const GroupedGemmSetupWorkspace &ws, const GroupedOperandSelection &A_sel, - const GroupedOperandSelection &B_sel, const transformer_engine::GroupedTensor *C, - const transformer_engine::GroupedTensor *D, const transformer_engine::Tensor *alpha_tensor, - const transformer_engine::Tensor *beta_tensor, size_t num_tensors, cudaStream_t stream) { - TensorShapeInfo A_meta = TensorShapeInfo::from_tensor(A_sel.tensor); - TensorShapeInfo B_meta = TensorShapeInfo::from_tensor(B_sel.tensor); - TensorShapeInfo C_meta = TensorShapeInfo::for_C(C, D); - TensorShapeInfo D_meta = TensorShapeInfo::from_tensor(D); - - const char *c_base = static_cast(C->data.dptr); - char *d_base = static_cast(D->data.dptr); - - const size_t a_elem_size = transformer_engine::typeToSize(A_sel.dtype); - const size_t b_elem_size = transformer_engine::typeToSize(B_sel.dtype); - const size_t c_elem_size = transformer_engine::typeToSize(C->dtype()); - const size_t d_elem_size = transformer_engine::typeToSize(D->dtype()); - - const int threads_per_block = 256; - const int num_blocks = (num_tensors + threads_per_block - 1) / threads_per_block; - - setup_grouped_gemm_kernel<<>>( - ws.A_ptrs, ws.B_ptrs, ws.C_ptrs, ws.D_ptrs, ws.M, ws.N, ws.K, ws.alpha_ptrs, ws.beta_ptrs, - A_sel.base, B_sel.base, c_base, d_base, A_meta, B_meta, C_meta, D_meta, a_elem_size, - b_elem_size, c_elem_size, d_elem_size, static_cast(alpha_tensor->data.dptr), - static_cast(beta_tensor->data.dptr), A_sel.trans, B_sel.trans, num_tensors); - - NVTE_CHECK_CUDA(cudaGetLastError()); -} - -inline size_t grouped_gemm_setup_workspace_size(size_t num_tensors) { - return GroupedGemmSetupWorkspace::required_setup_size(num_tensors, kGroupedGemmAlignment); -} - -void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, - const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, - NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, - NVTEMatmulConfig config, cudaStream_t stream, const int64_t *avg_m, - const int64_t *avg_n, const int64_t *avg_k) { - NVTE_API_CALL(nvte_grouped_gemm); - using namespace transformer_engine; - - // Convert to internal types - const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A); - const GroupedTensor *inputB = convertNVTEGroupedTensorCheck(B); - const GroupedTensor *inputC_raw = convertNVTEGroupedTensor(C); // Can be NULL - GroupedTensor *outputD = convertNVTEGroupedTensorCheck(D); - const Tensor *alpha_tensor = convertNVTETensorCheck(alpha); - const Tensor *beta_tensor = convertNVTETensorCheck(beta); - Tensor *wspace_setup = convertNVTETensor(workspace_setup); - Tensor *wspace_cublas = convertNVTETensor(workspace_cublas); - - // Validate inputs and num_tensors - validate_grouped_gemm_inputs(inputA, inputB, inputC_raw, outputD, alpha_tensor, beta_tensor); - - // If C is NULL, use D as C (valid when beta=0, cuBLAS won't read C data) - const GroupedTensor *inputC = (inputC_raw != nullptr) ? inputC_raw : outputD; - const size_t num_tensors = inputA->num_tensors; - - // Select operand storage (row-wise vs column-wise) and adjust transpose flags to - // mirror the non-grouped GEMM logic for FP8 layout constraints. - const auto A_sel = select_grouped_operand(inputA, static_cast(transa), /*is_A=*/true); - const auto B_sel = select_grouped_operand(inputB, static_cast(transb), /*is_A=*/false); - - // Workspaces: setup (pointer arrays) and cuBLAS - const size_t setup_workspace_size = grouped_gemm_setup_workspace_size(num_tensors); - const size_t cublas_workspace_size = kGroupedGemmCublasWorkspaceSize; - - void *setup_workspace_ptr = validate_and_get_workspace_ptr(wspace_setup, setup_workspace_size, - "Grouped GEMM setup workspace"); - void *cublas_workspace_ptr = validate_and_get_workspace_ptr(wspace_cublas, cublas_workspace_size, - "Grouped GEMM cuBLAS workspace"); - - auto setup_workspace = GroupedGemmSetupWorkspace::from_buffers( - static_cast(setup_workspace_ptr), num_tensors); - launch_grouped_gemm_setup(setup_workspace, A_sel, B_sel, inputC, outputD, alpha_tensor, - beta_tensor, num_tensors, stream); - - // Get cuBLAS handle - using cublasHandleManager = detail::HandleManager; - cublasLtHandle_t handle = cublasHandleManager::Instance().GetHandle(); - - // Setup cuBLAS operations - cublasOperation_t op_A = A_sel.trans ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t op_B = B_sel.trans ? CUBLAS_OP_T : CUBLAS_OP_N; - - // Create grouped matrix layouts - cublasLtMatrixLayoutOpaque_t descA, descB, descC, descD; - init_matrix_layouts(descA, descB, descC, descD, setup_workspace, A_sel, B_sel, outputD, - num_tensors); - - // Create matmul descriptor - cublasLtMatmulDescOpaque_t matmulDesc; - init_matmul_desc(matmulDesc, op_A, op_B); - set_fp8_scale_pointers(matmulDesc, A_sel, B_sel); - - // Compute average dimensions for heuristics - // K dimension: if transa, K is A's first dim; if not, K is A's last dim - int64_t avg_m_val = avg_m ? *avg_m : compute_avg_first_dim(outputD); - int64_t avg_n_val = avg_n ? *avg_n : compute_avg_last_dim(outputD); - int64_t avg_k_val = avg_k ? *avg_k - : (A_sel.trans ? compute_avg_first_dim(A_sel.tensor) - : compute_avg_last_dim(A_sel.tensor)); - - // Heuristic selection - cublasLtMatmulAlgo_t algo = select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, - descD, avg_m_val, avg_n_val, avg_k_val); - - // Execute the grouped GEMM - NVTE_CHECK_CUBLAS(cublasLtMatmul(handle, &matmulDesc, setup_workspace.alpha_ptrs, - setup_workspace.A_ptrs, &descA, setup_workspace.B_ptrs, &descB, - setup_workspace.beta_ptrs, setup_workspace.C_ptrs, &descC, - setup_workspace.D_ptrs, &descD, &algo, cublas_workspace_ptr, - kGroupedGemmCublasWorkspaceSize, stream)); -} diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu new file mode 100644 index 0000000000..4125bd82bf --- /dev/null +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -0,0 +1,599 @@ +/************************************************************************* + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include +#include +#include +#include +#include + +#include + +#include "../common.h" +#include "../util/cuda_runtime.h" +#include "../util/handle_manager.h" +#include "../util/logging.h" +#include "./cublaslt_grouped_gemm.cuh" + +namespace { + +inline void CreateCublasHandle(cublasLtHandle_t *handle) { + NVTE_CHECK_CUBLAS(cublasLtCreate(handle)); +} + +} // namespace + +#if CUBLAS_VERSION >= 130100 + +namespace { + +// Helper struct to pass per-tensor shape/offset info (pointer or uniform value) +struct TensorShapeInfo { + const int64_t *first_dims; // nullptr if uniform + const int64_t *last_dims; // nullptr if uniform + const int64_t *offsets; // nullptr if need to compute + int64_t uniform_first; // used if first_dims == nullptr + int64_t uniform_last; // used if last_dims == nullptr + + // Create from GroupedTensor + static TensorShapeInfo from_tensor(const transformer_engine::GroupedTensor *t) { + const bool has_first = t->first_dims.has_data(); + const bool has_last = t->last_dims.has_data(); + // When per-tensor dims are not provided, we must be in the uniform-shape case. + NVTE_CHECK(has_first || t->all_same_first_dim(), + "GroupedTensor is missing first_dims for varying shapes"); + NVTE_CHECK(has_last || t->all_same_last_dim(), + "GroupedTensor is missing last_dims for varying shapes"); + + const int64_t *first_ptr = + has_first ? static_cast(t->first_dims.dptr) : nullptr; + const int64_t *last_ptr = has_last ? static_cast(t->last_dims.dptr) : nullptr; + + const int64_t uniform_first = has_first ? 0 : static_cast(t->get_common_first_dim()); + const int64_t uniform_last = has_last ? 0 : static_cast(t->get_common_last_dim()); + + return {first_ptr, last_ptr, + t->tensor_offsets.has_data() ? static_cast(t->tensor_offsets.dptr) + : nullptr, + uniform_first, uniform_last}; + } + + // Create for C tensor (uses D's dimensions, only has offsets) + static TensorShapeInfo for_C(const transformer_engine::GroupedTensor *C, + const transformer_engine::GroupedTensor *D) { + const bool has_first = D->first_dims.has_data(); + const bool has_last = D->last_dims.has_data(); + NVTE_CHECK(has_first || D->all_same_first_dim(), + "GroupedTensor D is missing first_dims for varying shapes"); + NVTE_CHECK(has_last || D->all_same_last_dim(), + "GroupedTensor D is missing last_dims for varying shapes"); + + const int64_t *first_ptr = + has_first ? static_cast(D->first_dims.dptr) : nullptr; + const int64_t *last_ptr = has_last ? static_cast(D->last_dims.dptr) : nullptr; + const int64_t uniform_first = has_first ? 0 : static_cast(D->get_common_first_dim()); + const int64_t uniform_last = has_last ? 0 : static_cast(D->get_common_last_dim()); + + return {first_ptr, last_ptr, + C->tensor_offsets.has_data() ? static_cast(C->tensor_offsets.dptr) + : nullptr, + uniform_first, uniform_last}; + } +}; + +// Helper functions to compute average dimensions from logical_shape for heuristics +// These are hints for cuBLASLt algorithm selection, don't need to be exact +inline int64_t compute_avg_first_dim(const transformer_engine::GroupedTensor *t) { + // logical_shape[0] is either num_tensors*M (uniform) or sum_of_M (varying first) + // In both cases, dividing by num_tensors gives the average + return static_cast(t->logical_shape.data[0]) / static_cast(t->num_tensors); +} + +inline int64_t compute_avg_last_dim(const transformer_engine::GroupedTensor *t) { + if (t->all_same_last_dim()) { + // logical_shape[1] is the common N + return static_cast(t->logical_shape.data[1]); + } + // When varying, logical_shape[1] should be sum of last dims if provided; otherwise fallback to avg via division. + return static_cast(t->logical_shape.data[1]) / static_cast(t->num_tensors); +} + +// Workspace layout for grouped GEMM +struct GroupedGemmSetupWorkspace { + void **A_ptrs; + void **B_ptrs; + void **C_ptrs; + void **D_ptrs; + int *M; + int *N; + int *K; + float **alpha_ptrs; + float **beta_ptrs; + + // Initialize from workspace buffer + // Layout: all pointer arrays first (8-byte aligned), then int arrays (4-byte aligned) + static GroupedGemmSetupWorkspace from_buffers(char *setup_ws_ptr, size_t num_tensors) { + GroupedGemmSetupWorkspace ws; + size_t offset = 0; + const size_t ptr_size = num_tensors * sizeof(void *); + const size_t int_size = num_tensors * sizeof(int); + + // Pointer arrays first (all 8-byte aligned) + ws.A_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.B_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.C_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.D_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.alpha_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + ws.beta_ptrs = reinterpret_cast(setup_ws_ptr + offset); + offset += ptr_size; + + // Int arrays last (4-byte aligned, always satisfied after pointer arrays) + ws.M = reinterpret_cast(setup_ws_ptr + offset); + offset += int_size; + ws.N = reinterpret_cast(setup_ws_ptr + offset); + offset += int_size; + ws.K = reinterpret_cast(setup_ws_ptr + offset); + + return ws; + } + + // Calculate required size for setup workspace (pointer arrays + M/N/K) + static size_t required_setup_size(size_t num_tensors, size_t alignment) { + const size_t ptr_size = num_tensors * sizeof(void *); + const size_t int_size = num_tensors * sizeof(int); + // Layout: 6 ptr arrays, then 3 int arrays (no padding needed) + size_t size = 6 * ptr_size + 3 * int_size; + size = ((size + alignment - 1) / alignment) * alignment; + return size; + } +}; + +// ----------------------------------------------------------------------------- +// Helper routines to keep nvte_grouped_gemm readable +// ----------------------------------------------------------------------------- +inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor *inputA, + const transformer_engine::GroupedTensor *inputB, + const transformer_engine::GroupedTensor *inputC, + const transformer_engine::GroupedTensor *outputD, + const transformer_engine::Tensor *alpha_tensor, + const transformer_engine::Tensor *beta_tensor) { + const size_t num_tensors = inputA->num_tensors; + NVTE_CHECK(num_tensors >= 1, "Grouped GEMM: num_tensors must be at least 1"); + NVTE_CHECK(inputB->num_tensors == num_tensors, + "Grouped GEMM: A and B must have the same num_tensors"); + // C can be NULL (will use D as C when beta=0) + if (inputC != nullptr) { + NVTE_CHECK(inputC->num_tensors == num_tensors, + "Grouped GEMM: A and C must have the same num_tensors"); + } + NVTE_CHECK(outputD->num_tensors == num_tensors, + "Grouped GEMM: A and D must have the same num_tensors"); + + // Validate alpha/beta have per-matrix values + const size_t alpha_numel = alpha_tensor->data.numel(); + const size_t beta_numel = beta_tensor->data.numel(); + NVTE_CHECK(alpha_numel == num_tensors, + "Grouped GEMM: alpha must have num_tensors (", num_tensors, ") elements, got ", + alpha_numel); + NVTE_CHECK(beta_numel == num_tensors, + "Grouped GEMM: beta must have num_tensors (", num_tensors, ") elements, got ", + beta_numel); + + auto is_fp8_or_16bit = [](transformer_engine::DType dtype) { + return dtype == transformer_engine::DType::kFloat8E4M3 || + dtype == transformer_engine::DType::kFloat8E5M2 || + dtype == transformer_engine::DType::kBFloat16 || + dtype == transformer_engine::DType::kFloat16; + }; + auto is_output_dtype = [](transformer_engine::DType dtype) { + return dtype == transformer_engine::DType::kBFloat16 || + dtype == transformer_engine::DType::kFloat16 || + dtype == transformer_engine::DType::kFloat32; + }; + NVTE_CHECK(is_fp8_or_16bit(inputA->dtype()) && is_fp8_or_16bit(inputB->dtype()), + "Grouped GEMM inputs must be FP8, BF16, or FP16."); + // Only check C dtype if C is provided + if (inputC != nullptr) { + NVTE_CHECK(is_output_dtype(inputC->dtype()), "Grouped GEMM: C must be BF16, FP16, or FP32."); + } + NVTE_CHECK(is_output_dtype(outputD->dtype()), "Grouped GEMM: D must be BF16, FP16, or FP32."); + NVTE_CHECK(inputA->has_data() || inputA->has_columnwise_data(), + "Grouped GEMM: A tensor is missing both row-wise and column-wise data"); + NVTE_CHECK(inputB->has_data() || inputB->has_columnwise_data(), + "Grouped GEMM: B tensor is missing both row-wise and column-wise data"); +} + +// Select row-wise vs column-wise storage and adjust transpose flag for grouped GEMM. +// Mirrors the non-grouped GEMM logic for FP8 layout handling (TN-only on Hopper) and +// fallback to column-wise data when row-wise is absent. +struct GroupedOperandSelection { + const transformer_engine::GroupedTensor *tensor = nullptr; + const char *dptr = nullptr; + transformer_engine::DType dtype = transformer_engine::DType::kNumTypes; + bool trans = false; + bool use_columnwise = false; +}; + +inline GroupedOperandSelection select_grouped_operand(const transformer_engine::GroupedTensor *t, + bool trans, bool is_A) { + using namespace transformer_engine; + const bool has_row = t->has_data(); + const bool has_col = t->has_columnwise_data(); + NVTE_CHECK(has_row || has_col, + "Grouped GEMM operand is missing both row-wise and column-wise data"); + + // Currently only unquantized data and tensor-scaled FP8 are supported. + const auto sm = t->scaling_mode; + NVTE_CHECK(sm == NVTE_DELAYED_TENSOR_SCALING, + "Grouped GEMM is only supported with unquantized data and tensor-scaled FP8 data"); + + const DType row_dtype = t->data.dtype; + const DType col_dtype = t->columnwise_data.dtype; + GroupedOperandSelection sel; + sel.tensor = t; + sel.trans = trans; + + const DType rep_dtype = has_row ? row_dtype : col_dtype; + const bool is_fp8 = is_fp8_dtype(rep_dtype); + const bool non_tn_fp8_ok = nvte_is_non_tn_fp8_gemm_supported(); + + // Hopper-style TN-only FP8: force TN by switching layout and flipping transpose when needed. + if (is_fp8 && !non_tn_fp8_ok) { + if (is_A) { + if (!sel.trans) { + NVTE_CHECK(has_col, "Grouped GEMM: A is missing column-wise data needed for FP8 TN layout"); + sel.dptr = static_cast(t->columnwise_data.dptr); + sel.dtype = col_dtype; + sel.trans = true; // using pre-transposed storage + sel.use_columnwise = true; + return sel; + } + } else { // B + if (sel.trans) { + NVTE_CHECK(has_col, "Grouped GEMM: B is missing column-wise data needed for FP8 TN layout"); + sel.dptr = static_cast(t->columnwise_data.dptr); + sel.dtype = col_dtype; + sel.trans = false; // using pre-transposed storage + sel.use_columnwise = true; + return sel; + } + } + } + + // If only column-wise data is available, mirror the transpose flag (pre-transposed storage). + if (!has_row && has_col) { + // On Hopper FP8, this would break TN requirement - should have been handled above + NVTE_CHECK( + !is_fp8 || non_tn_fp8_ok, + "Grouped GEMM: FP8 on Hopper requires row-wise data for this transpose configuration"); + sel.dptr = static_cast(t->columnwise_data.dptr); + sel.dtype = col_dtype; + sel.trans = !sel.trans; + sel.use_columnwise = true; + return sel; + } + + // Default: use row-wise data (column-wise case already handled above) + sel.dptr = static_cast(t->data.dptr); + sel.dtype = row_dtype; + sel.use_columnwise = false; + return sel; +} + +inline void *validate_and_get_workspace_ptr(transformer_engine::Tensor *ws, size_t required_size, + const char *workspace_name) { + NVTE_CHECK(ws != nullptr, workspace_name, " tensor is null."); + const size_t provided_size = get_buffer_size_bytes(ws->data.numel(), ws->data.dtype); + NVTE_CHECK(provided_size >= required_size, "Grouped GEMM: Insufficient ", workspace_name, + ". Required: ", required_size, " bytes, Available: ", provided_size, " bytes."); + return ws->data.dptr; +} + +inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, + cublasLtMatrixLayoutOpaque_t &descB, + cublasLtMatrixLayoutOpaque_t &descC, + cublasLtMatrixLayoutOpaque_t &descD, + const GroupedGemmSetupWorkspace &ws, + const GroupedOperandSelection &A_sel, + const GroupedOperandSelection &B_sel, + const transformer_engine::GroupedTensor *D, size_t num_tensors) { + const cudaDataType_t A_type = get_cuda_dtype(A_sel.dtype); + const cudaDataType_t B_type = get_cuda_dtype(B_sel.dtype); + const cudaDataType_t D_type = get_cuda_dtype(D->dtype()); + + // For column-major layout: leading dimension is the number of rows in storage. + // If columnwise data was chosen, storage is already transposed. + int *rowa = A_sel.use_columnwise ? ws.M : (A_sel.trans ? ws.K : ws.M); + int *cola = A_sel.use_columnwise ? ws.K : (A_sel.trans ? ws.M : ws.K); + int *lda = rowa; + int *rowb = B_sel.use_columnwise ? ws.N : (B_sel.trans ? ws.N : ws.K); + int *colb = B_sel.use_columnwise ? ws.K : (B_sel.trans ? ws.K : ws.N); + int *ldb = rowb; + + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rowa, cola, lda)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rowb, colb, ldb)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, ws.M, ws.N, ws.M)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, ws.M, ws.N, ws.M)); +} + +inline void init_matmul_desc(cublasLtMatmulDescOpaque_t &matmulDesc, cublasOperation_t op_A, + cublasOperation_t op_B) { + NVTE_CHECK_CUBLAS(cublasLtMatmulDescInit(&matmulDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); + + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &op_A, + sizeof(op_A))); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &op_B, + sizeof(op_B))); + + cublasLtPointerMode_t pointer_mode = CUBLASLT_POINTER_MODE_DEVICE; + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, + &pointer_mode, sizeof(pointer_mode))); + + int64_t alphabeta_batch_stride = 1; + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, + CUBLASLT_MATMUL_DESC_ALPHA_BATCH_STRIDE, + &alphabeta_batch_stride, sizeof(int64_t))); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, + CUBLASLT_MATMUL_DESC_BETA_BATCH_STRIDE, + &alphabeta_batch_stride, sizeof(int64_t))); +} + +inline void set_fp8_scale_pointers(cublasLtMatmulDescOpaque_t &matmulDesc, + const GroupedOperandSelection &A_sel, + const GroupedOperandSelection &B_sel) { + const bool is_fp8_a = is_fp8_dtype(A_sel.dtype); + const bool is_fp8_b = is_fp8_dtype(B_sel.dtype); + if (!is_fp8_a && !is_fp8_b) return; + + if (is_fp8_a) { + void *a_scale_inv = A_sel.use_columnwise ? A_sel.tensor->columnwise_scale_inv.dptr + : A_sel.tensor->scale_inv.dptr; + NVTE_CHECK(a_scale_inv != nullptr, "FP8 grouped GEMM: A scale_inv is required"); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + &matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_inv, sizeof(a_scale_inv))); + } + if (is_fp8_b) { + void *b_scale_inv = B_sel.use_columnwise ? B_sel.tensor->columnwise_scale_inv.dptr + : B_sel.tensor->scale_inv.dptr; + NVTE_CHECK(b_scale_inv != nullptr, "FP8 grouped GEMM: B scale_inv is required"); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + &matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_inv, sizeof(b_scale_inv))); + } +} + +// Constants for grouped GEMM workspace (declared early for use in heuristics) +static constexpr size_t kGroupedGemmAlignment = 256; +static constexpr size_t kGroupedGemmCublasWorkspaceSize = 32ull * 1024 * 1024; // 32 MiB + +inline cublasLtMatmulAlgo_t select_grouped_gemm_algo(cublasLtHandle_t handle, + cublasLtMatmulDescOpaque_t &matmulDesc, + cublasLtMatrixLayoutOpaque_t &descA, + cublasLtMatrixLayoutOpaque_t &descB, + cublasLtMatrixLayoutOpaque_t &descC, + cublasLtMatrixLayoutOpaque_t &descD, + int64_t avg_m, int64_t avg_n, int64_t avg_k) { + cublasLtMatmulPreferenceOpaque_t preference; + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceInit(&preference)); + NVTE_CHECK_CUBLAS( + cublasLtMatmulPreferenceSetAttribute(&preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &kGroupedGemmCublasWorkspaceSize, sizeof(size_t))); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_GROUPED_DESC_D_AVERAGE_ROWS, &avg_m, sizeof(int64_t))); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_GROUPED_DESC_D_AVERAGE_COLS, &avg_n, sizeof(int64_t))); + NVTE_CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_GROUPED_AVERAGE_REDUCTION_DIM, &avg_k, sizeof(int64_t))); + + cublasLtMatmulHeuristicResult_t heuristicResult; + int returnedResults = 0; + auto status = cublasLtMatmulAlgoGetHeuristic(handle, &matmulDesc, &descA, &descB, &descC, &descD, + &preference, 1, &heuristicResult, &returnedResults); + NVTE_CHECK(status != CUBLAS_STATUS_NOT_SUPPORTED, + "Unable to find suitable cuBLAS grouped GEMM algorithm"); + NVTE_CHECK_CUBLAS(status); + NVTE_CHECK(returnedResults > 0, "No suitable algorithm found for grouped GEMM"); + return heuristicResult.algo; +} + +// Single kernel that sets up all GEMM parameters. +// Rationale: cuBLASLt grouped matmul API needs flat arrays of pointers and per-matrix M/N/K, +// but NVTEGroupedTensor stores a single contiguous buffer + optional per-tensor offsets/shapes. +// We bridge the mismatch on GPU by computing per-group pointers and dims in one kernel. +__global__ void setup_grouped_gemm_kernel( + // Output arrays + void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, int *M, int *N, int *K, + float **alpha_ptrs, float **beta_ptrs, + // Base pointers + const char *a_base, const char *b_base, const char *c_base, char *d_base, + // Dimension info (per tensor) + TensorShapeInfo A_meta, TensorShapeInfo B_meta, TensorShapeInfo C_meta, TensorShapeInfo D_meta, + // Element sizes + size_t a_elem_size, size_t b_elem_size, size_t c_elem_size, size_t d_elem_size, + // Alpha/beta pointers (per-matrix arrays) + float *alpha_ptr, float *beta_ptr, + // Transpose flags + bool transa, bool transb, + // Number of tensors + size_t num_tensors) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_tensors) return; + + // Get dimensions for this tensor (from array or uniform value) + int64_t a_first = A_meta.first_dims ? A_meta.first_dims[idx] : A_meta.uniform_first; + int64_t a_last = A_meta.last_dims ? A_meta.last_dims[idx] : A_meta.uniform_last; + int64_t b_first = B_meta.first_dims ? B_meta.first_dims[idx] : B_meta.uniform_first; + int64_t b_last = B_meta.last_dims ? B_meta.last_dims[idx] : B_meta.uniform_last; + + // Compute offsets (from array or compute from uniform dims) + int64_t a_offset = + A_meta.offsets ? A_meta.offsets[idx] : (idx * A_meta.uniform_first * A_meta.uniform_last); + int64_t b_offset = + B_meta.offsets ? B_meta.offsets[idx] : (idx * B_meta.uniform_first * B_meta.uniform_last); + int64_t c_offset = + C_meta.offsets ? C_meta.offsets[idx] : (idx * C_meta.uniform_first * C_meta.uniform_last); + int64_t d_offset = + D_meta.offsets ? D_meta.offsets[idx] : (idx * D_meta.uniform_first * D_meta.uniform_last); + + // Compute data pointers + A_ptrs[idx] = const_cast(a_base) + a_offset * a_elem_size; + B_ptrs[idx] = const_cast(b_base) + b_offset * b_elem_size; + C_ptrs[idx] = const_cast(c_base) + c_offset * c_elem_size; + D_ptrs[idx] = d_base + d_offset * d_elem_size; + + // Compute M, N, K dimensions + // Test stores A as {K,M} when !transa, {M,K} when transa + // Test stores B as {N,K} when !transb, {K,N} when transb + M[idx] = static_cast(transa ? a_first : a_last); + K[idx] = static_cast(transa ? a_last : a_first); + N[idx] = static_cast(transb ? b_last : b_first); + + // Fill alpha/beta pointers (per-matrix) + alpha_ptrs[idx] = alpha_ptr + idx; + beta_ptrs[idx] = beta_ptr + idx; +} + +// Launch the setup kernel to populate workspace arrays +inline void launch_grouped_gemm_setup( + const GroupedGemmSetupWorkspace &ws, const GroupedOperandSelection &A_sel, + const GroupedOperandSelection &B_sel, const transformer_engine::GroupedTensor *C, + const transformer_engine::GroupedTensor *D, const transformer_engine::Tensor *alpha_tensor, + const transformer_engine::Tensor *beta_tensor, size_t num_tensors, cudaStream_t stream) { + TensorShapeInfo A_meta = TensorShapeInfo::from_tensor(A_sel.tensor); + TensorShapeInfo B_meta = TensorShapeInfo::from_tensor(B_sel.tensor); + TensorShapeInfo C_meta = TensorShapeInfo::for_C(C, D); + TensorShapeInfo D_meta = TensorShapeInfo::from_tensor(D); + + const char *c_base = static_cast(C->data.dptr); + char *d_base = static_cast(D->data.dptr); + + const size_t a_elem_size = transformer_engine::typeToSize(A_sel.dtype); + const size_t b_elem_size = transformer_engine::typeToSize(B_sel.dtype); + const size_t c_elem_size = transformer_engine::typeToSize(C->dtype()); + const size_t d_elem_size = transformer_engine::typeToSize(D->dtype()); + + const int threads_per_block = 256; + const int num_blocks = (num_tensors + threads_per_block - 1) / threads_per_block; + + setup_grouped_gemm_kernel<<>>( + ws.A_ptrs, ws.B_ptrs, ws.C_ptrs, ws.D_ptrs, ws.M, ws.N, ws.K, ws.alpha_ptrs, ws.beta_ptrs, + A_sel.dptr, B_sel.dptr, c_base, d_base, A_meta, B_meta, C_meta, D_meta, a_elem_size, + b_elem_size, c_elem_size, d_elem_size, static_cast(alpha_tensor->data.dptr), + static_cast(beta_tensor->data.dptr), A_sel.trans, B_sel.trans, num_tensors); + + NVTE_CHECK_CUDA(cudaGetLastError()); +} + +inline size_t grouped_gemm_setup_workspace_size(size_t num_tensors) { + return GroupedGemmSetupWorkspace::required_setup_size(num_tensors, kGroupedGemmAlignment); +} + +} // namespace + +void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, + const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, + NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, + cudaStream_t stream, const int64_t *avg_m, const int64_t *avg_n, + const int64_t *avg_k) { + NVTE_API_CALL(nvte_grouped_gemm); + using namespace transformer_engine; + + // Grouped GEMM requires Hopper (SM90) or newer + const int current_device = cuda::current_device(); + NVTE_CHECK(cuda::sm_arch(current_device) >= 90, + "nvte_grouped_gemm requires Hopper (SM90) or newer architecture."); + + // Convert to internal types + const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A); + const GroupedTensor *inputB = convertNVTEGroupedTensorCheck(B); + const GroupedTensor *inputC_raw = convertNVTEGroupedTensor(C); // Can be NULL + GroupedTensor *outputD = convertNVTEGroupedTensorCheck(D); + const Tensor *alpha_tensor = convertNVTETensorCheck(alpha); + const Tensor *beta_tensor = convertNVTETensorCheck(beta); + Tensor *wspace_setup = convertNVTETensor(workspace_setup); + Tensor *wspace_cublas = convertNVTETensor(workspace_cublas); + + // Validate inputs and num_tensors + validate_grouped_gemm_inputs(inputA, inputB, inputC_raw, outputD, alpha_tensor, beta_tensor); + + // If C is NULL, use D as C (valid when beta=0, cuBLAS won't read C data) + const GroupedTensor *inputC = (inputC_raw != nullptr) ? inputC_raw : outputD; + const size_t num_tensors = inputA->num_tensors; + + // Select operand storage (row-wise vs column-wise) and adjust transpose flags to + // mirror the non-grouped GEMM logic for FP8 layout constraints. + const auto A_sel = select_grouped_operand(inputA, static_cast(transa), /*is_A=*/true); + const auto B_sel = select_grouped_operand(inputB, static_cast(transb), /*is_A=*/false); + + // Workspaces: setup (pointer arrays) and cuBLAS + const size_t setup_workspace_size = grouped_gemm_setup_workspace_size(num_tensors); + const size_t cublas_workspace_size = kGroupedGemmCublasWorkspaceSize; + + void *setup_workspace_ptr = validate_and_get_workspace_ptr(wspace_setup, setup_workspace_size, + "Grouped GEMM setup workspace"); + void *cublas_workspace_ptr = validate_and_get_workspace_ptr(wspace_cublas, cublas_workspace_size, + "Grouped GEMM cuBLAS workspace"); + + auto setup_workspace = GroupedGemmSetupWorkspace::from_buffers( + static_cast(setup_workspace_ptr), num_tensors); + launch_grouped_gemm_setup(setup_workspace, A_sel, B_sel, inputC, outputD, alpha_tensor, + beta_tensor, num_tensors, stream); + + // Get cuBLAS handle + using cublasHandleManager = detail::HandleManager; + cublasLtHandle_t handle = cublasHandleManager::Instance().GetHandle(); + + // Setup cuBLAS operations + cublasOperation_t op_A = A_sel.trans ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op_B = B_sel.trans ? CUBLAS_OP_T : CUBLAS_OP_N; + + // Create grouped matrix layouts + cublasLtMatrixLayoutOpaque_t descA, descB, descC, descD; + init_matrix_layouts(descA, descB, descC, descD, setup_workspace, A_sel, B_sel, outputD, + num_tensors); + + // Create matmul descriptor + cublasLtMatmulDescOpaque_t matmulDesc; + init_matmul_desc(matmulDesc, op_A, op_B); + set_fp8_scale_pointers(matmulDesc, A_sel, B_sel); + + // Compute average dimensions for heuristics + // K dimension: if transa, K is A's first dim; if not, K is A's last dim + int64_t avg_m_val = avg_m ? *avg_m : compute_avg_first_dim(outputD); + int64_t avg_n_val = avg_n ? *avg_n : compute_avg_last_dim(outputD); + int64_t avg_k_val = avg_k ? *avg_k + : (A_sel.trans ? compute_avg_first_dim(A_sel.tensor) + : compute_avg_last_dim(A_sel.tensor)); + + // Heuristic selection + cublasLtMatmulAlgo_t algo = select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, + descD, avg_m_val, avg_n_val, avg_k_val); + + // Execute the grouped GEMM + NVTE_CHECK_CUBLAS(cublasLtMatmul(handle, &matmulDesc, setup_workspace.alpha_ptrs, + setup_workspace.A_ptrs, &descA, setup_workspace.B_ptrs, &descB, + setup_workspace.beta_ptrs, setup_workspace.C_ptrs, &descC, + setup_workspace.D_ptrs, &descD, &algo, cublas_workspace_ptr, + kGroupedGemmCublasWorkspaceSize, stream)); +} + +#else // CUBLAS_VERSION < 130100 + +void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, + const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, + NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, + cudaStream_t stream, const int64_t *avg_m, const int64_t *avg_n, + const int64_t *avg_k) { + NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.2+, but compile-time cuBLAS version is ", + CUBLAS_VERSION, ". Please upgrade to CUDA 13.2 or newer."); +} + +#endif // CUBLAS_VERSION >= 130100 + diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh new file mode 100644 index 0000000000..6514ba2f97 --- /dev/null +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh @@ -0,0 +1,18 @@ +/************************************************************************* + * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#ifndef TRANSFORMER_ENGINE_COMMON_GEMM_CUBLASLT_GROUPED_GEMM_CUH_ +#define TRANSFORMER_ENGINE_COMMON_GEMM_CUBLASLT_GROUPED_GEMM_CUH_ + +#include +#include +#include + +// nvte_grouped_gemm is declared in transformer_engine/gemm.h +// This header is for internal use only. + +#endif // TRANSFORMER_ENGINE_COMMON_GEMM_CUBLASLT_GROUPED_GEMM_CUH_ + diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 9dfa009115..b2e42bd66f 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -11,7 +11,7 @@ #ifndef TRANSFORMER_ENGINE_GEMM_H_ #define TRANSFORMER_ENGINE_GEMM_H_ -#include +#include #include "transformer_engine.h" @@ -233,6 +233,10 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor /* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */ /*! \brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C + * + * \note Requires cuBLAS 13.2+ (CUDA 13.2+) and Hopper (SM90) or newer GPU architecture. + * Will error at runtime if compiled with an older cuBLAS version or run on + * a pre-Hopper GPU. * * Performs batched GEMM on a collection of matrices with potentially different shapes. * All tensors in the group must have compatible dimensions for matrix multiplication. @@ -262,6 +266,8 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * heuristics. If NULL, computed automatically from A's logical shape. * * Requirements: + * - cuBLAS 13.2+ (CUDA 13.2+) + * - Hopper (SM90) or newer GPU architecture * - A, B, C (if provided), D must have the same num_tensors * - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i] * - Shape compatibility: if transa=false, transb=false: @@ -270,8 +276,8 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, - NVTEMatmulConfig config, cudaStream_t stream, const int64_t *avg_m, - const int64_t *avg_n, const int64_t *avg_k); + cudaStream_t stream, const int64_t *avg_m, const int64_t *avg_n, + const int64_t *avg_k); #ifdef __cplusplus } // extern "C" From 047a9f93bd5252241883077e0a904b2c7f1c6e57 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Fri, 19 Dec 2025 12:29:12 +0100 Subject: [PATCH 17/98] fix Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 5 +++-- transformer_engine/common/CMakeLists.txt | 1 + transformer_engine/common/include/transformer_engine/gemm.h | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 0ea76946bc..3336dbc6d5 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -137,8 +137,9 @@ GroupedBuffers build_grouped_tensor(const std::vector& tensors, // cuBLAS requires aligned pointers for vectorized loads static std::mt19937 gen(12345); std::uniform_int_distribution dist(0, 3); - // Calculate elements needed for 16-byte alignment - const size_t align_elements = (16 * 8) / typeToNumBits(dtype); // 16 bytes / element_size + // Calculate elements needed for 16-byte alignment in bytes, rounded up + const size_t align_elements = + std::max(1, (16 + elem_size - 1) / elem_size); // 16 bytes / element_size return dist(gen) * static_cast(align_elements); }; diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index 264f7f9a78..e25bf02439 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -144,6 +144,7 @@ list(APPEND transformer_engine_cuda_sources fused_attn/fused_attn_fp8.cu fused_attn/utils.cu gemm/cublaslt_gemm.cu + gemm/cublaslt_grouped_gemm.cu normalization/layernorm/ln_bwd_semi_cuda_kernel.cu normalization/layernorm/ln_fwd_cuda_kernel.cu normalization/rmsnorm/rmsnorm_bwd_semi_cuda_kernel.cu diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index b2e42bd66f..f1e2776158 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -234,7 +234,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor /* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */ /*! \brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C * - * \note Requires cuBLAS 13.2+ (CUDA 13.2+) and Hopper (SM90) or newer GPU architecture. + * \note Requires cuBLAS 13.1+ (CUDA 13.1+) and Hopper (SM90) or newer GPU architecture. * Will error at runtime if compiled with an older cuBLAS version or run on * a pre-Hopper GPU. * @@ -253,7 +253,6 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * \param[out] D Output grouped tensor D. * \param[in] workspace_setup Workspace tensor for pointer array setup. * \param[in] workspace_cublas Workspace tensor for cuBLAS operations. - * \param[in] config Matrix multiplication configuration. * \param[in] stream CUDA stream for the operation. * \param[in] avg_m Optional hint for average M dimension across all matrices in the * group. Used by cuBLASLt for algorithm selection heuristics. From c490e06ab71f9919d69bfc2c67eb6b7cf6bc20ff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 19 Dec 2025 11:32:34 +0000 Subject: [PATCH 18/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/gemm/cublaslt_grouped_gemm.cu | 11 ++++------- .../common/gemm/cublaslt_grouped_gemm.cuh | 1 - .../common/include/transformer_engine/gemm.h | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 4125bd82bf..3647a4c39e 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -180,12 +180,10 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor // Validate alpha/beta have per-matrix values const size_t alpha_numel = alpha_tensor->data.numel(); const size_t beta_numel = beta_tensor->data.numel(); - NVTE_CHECK(alpha_numel == num_tensors, - "Grouped GEMM: alpha must have num_tensors (", num_tensors, ") elements, got ", - alpha_numel); - NVTE_CHECK(beta_numel == num_tensors, - "Grouped GEMM: beta must have num_tensors (", num_tensors, ") elements, got ", - beta_numel); + NVTE_CHECK(alpha_numel == num_tensors, "Grouped GEMM: alpha must have num_tensors (", num_tensors, + ") elements, got ", alpha_numel); + NVTE_CHECK(beta_numel == num_tensors, "Grouped GEMM: beta must have num_tensors (", num_tensors, + ") elements, got ", beta_numel); auto is_fp8_or_16bit = [](transformer_engine::DType dtype) { return dtype == transformer_engine::DType::kFloat8E4M3 || @@ -596,4 +594,3 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT } #endif // CUBLAS_VERSION >= 130100 - diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh index 6514ba2f97..a032e594d5 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh @@ -15,4 +15,3 @@ // This header is for internal use only. #endif // TRANSFORMER_ENGINE_COMMON_GEMM_CUBLASLT_GROUPED_GEMM_CUH_ - diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index f1e2776158..0c8d601d50 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -11,7 +11,7 @@ #ifndef TRANSFORMER_ENGINE_GEMM_H_ #define TRANSFORMER_ENGINE_GEMM_H_ -#include +#include #include "transformer_engine.h" From e39784572a83cb560fca20f2e7f77f7f7795a834 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Fri, 19 Dec 2025 08:35:50 -0800 Subject: [PATCH 19/98] batching working correctly for quant and gemm but slow Signed-off-by: Jeremy Berchtold --- transformer_engine/jax/cpp_extensions/base.py | 30 ++++-- transformer_engine/jax/cpp_extensions/gemm.py | 94 ++++++++++++++----- .../jax/cpp_extensions/quantization.py | 10 +- transformer_engine/jax/sharding.py | 2 - 4 files changed, 102 insertions(+), 34 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/base.py b/transformer_engine/jax/cpp_extensions/base.py index defdce7b68..335af2eb47 100644 --- a/transformer_engine/jax/cpp_extensions/base.py +++ b/transformer_engine/jax/cpp_extensions/base.py @@ -175,6 +175,7 @@ def batcher_impl( batched_args: Sequence[Any], batch_dims: Sequence[Union[int, None]], static_kwargs: dict, + output_bdims: Union[Sequence[Union[int, None]], None] = None, ) -> Tuple[Tuple[Any, ...], Tuple[Union[int, None], ...]]: """Batcher implementation for JAX primitives. @@ -207,13 +208,21 @@ def batcher(batched_args, batch_dims, *, arg1, arg2, arg3): if batch_dim is None: batch_dim = bdim batch_size = arg.shape[bdim] - # elif bdim != batch_dim: - # raise ValueError( - # "All batched arguments must have the same batch dimension. " - # f"Got batch_dims={batch_dims}" - # ) + elif output_bdims is None and bdim != batch_dim: + raise ValueError( + "All batched arguments must have the same batch dimension. " + f"Got batch_dims={batch_dims}" + ) + elif arg.shape[bdim] != batch_size: + raise ValueError( + "All batched arguments must have the same batch size. " + f"Got sizes {[arg.shape[bdim] for arg, bdim in zip(batched_args, batch_dims) if bdim is not None]}. " + f"Got batched_args={[arg.shape for arg, bdim in zip(batched_args, batch_dims) if bdim is not None]}." + ) assert batch_dim is not None and batch_size is not None, "Invalid batching config!" + print(f"[{cls.__name__}] Batching with size {batch_size}") + # Loop over batch dimension and collect results all_results = [] @@ -244,9 +253,14 @@ def batcher(batched_args, batch_dims, *, arg1, arg2, arg3): transposed = tuple(zip(*all_results)) # Stack each output along the batch dimension - stacked_results = tuple( - jnp.stack(list(out_list), axis=batch_dim) for out_list in transposed - ) + if output_bdims is not None: + stacked_results = tuple( + jnp.stack(list(out_list), axis=out_bdim) for out_list, out_bdim in zip(transposed, output_bdims) + ) + else: + stacked_results = tuple( + jnp.stack(list(out_list), axis=batch_dim) for out_list in transposed + ) # Single output: return unwrapped result if len(stacked_results) == 1: diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 7d44643046..28100c9715 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -583,27 +583,27 @@ def lowering( ) lhs_axis_boundary = get_lhs_axis_boundary(lhs_cdims, lhs_transposed) - lhs_contracting_size = ( - reduce(operator.mul, lhs_aval.shape[lhs_axis_boundary:]) - if lhs_transposed - else reduce(operator.mul, lhs_aval.shape[:lhs_axis_boundary]) - ) - assert_cublas_requirements( - scaling_mode, - lhs_contracting_size, - "LHS", - ) - rhs_axis_boundary = get_rhs_axis_boundary(rhs_cdims, rhs_transposed) - rhs_contracting_size = ( - reduce(operator.mul, rhs_aval.shape[:rhs_axis_boundary]) - if rhs_transposed - else reduce(operator.mul, rhs_aval.shape[rhs_axis_boundary:]) - ) - assert_cublas_requirements( - scaling_mode, - rhs_contracting_size, - "RHS", - ) + # lhs_contracting_size = ( + # reduce(operator.mul, lhs_aval.shape[lhs_axis_boundary:]) + # if lhs_transposed + # else reduce(operator.mul, lhs_aval.shape[:lhs_axis_boundary]) + # ) + # assert_cublas_requirements( + # scaling_mode, + # lhs_contracting_size, + # f"LHS {lhs_aval.shape} with contracting dims {lhs_cdims}", + # ) + # rhs_axis_boundary = get_rhs_axis_boundary(rhs_cdims, rhs_transposed) + # rhs_contracting_size = ( + # reduce(operator.mul, rhs_aval.shape[:rhs_axis_boundary]) + # if rhs_transposed + # else reduce(operator.mul, rhs_aval.shape[rhs_axis_boundary:]) + # ) + # assert_cublas_requirements( + # scaling_mode, + # rhs_contracting_size, + # f"RHS {rhs_aval.shape} with contracting dims {rhs_cdims}", + # ) args = (lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, alpha, beta) kwargs = { @@ -818,10 +818,60 @@ def batcher( # f"got lhs_bdims={lhs_bdims}, rhs_bdims={rhs_bdims}" # ) + f = partial(GemmPrimitive.outer_impl, + **{ + "out_dtype": out_dtype, + "contracting_dims": contracting_dims, + "scaling_mode": scaling_mode, + "fuse_bias": fuse_bias, + "fuse_gelu": fuse_gelu, + "grad": grad, + "use_split_accumulator": use_split_accumulator, + "collective_op": collective_op, + "transpose_batch_sequence": transpose_batch_sequence, + "sequence_dim": sequence_dim, + "is_outer": is_outer, + }) + + lhs_cdims, rhs_cdims = contracting_dims + # Calculate output batch dimension based on input batch dims and contracting dims + # Both lhs and rhs have batch dimensions that may be at different indices + if lhs_bdims is not None and rhs_bdims is not None: + # Count non-contracting dimensions in LHS before the batch dimension + lhs_non_contracting_before_batch = sum( + 1 for i in range(lhs_bdims) + if i not in lhs_cdims + ) + # The output batch dimension will be at the position corresponding to + # the LHS batch dimension's position among non-contracting dimensions + output_bdim = lhs_non_contracting_before_batch + elif lhs_bdims is not None: + # LHS has a batch dimension - this will be the output batch dimension + output_bdim = 0 + elif rhs_bdims is not None: + # RHS has a batch dimension - need to account for LHS non-contracting dims + lhs_non_contracting = len([i for i in range(len(batched_args[0].shape)) + if i not in lhs_cdims and i != lhs_bdims]) + output_bdim = lhs_non_contracting + else: + # No batch dimensions in either operand + output_bdim = None + # Use general batcher from BasePrimitive return GemmPrimitive.batcher_impl( batched_args, - batch_dims, + batch_dims=( + lhs_bdims, # lhs + 0, # lhs_scale_inv + rhs_bdims, # rhs + 0, # rhs_scale_inv + *(None for _ in batched_args[4:]), # bias, gelu_input, alpha, beta + ), + output_bdims=( + output_bdim, # output + 0, # bias_grad + 0, # pre_gelu_out + ), static_kwargs={ "out_dtype": out_dtype, "contracting_dims": contracting_dims, diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py index c5d76cf28c..a95afe8b8e 100644 --- a/transformer_engine/jax/cpp_extensions/quantization.py +++ b/transformer_engine/jax/cpp_extensions/quantization.py @@ -20,7 +20,6 @@ from .base import BasePrimitive, register_primitive from .misc import ( get_padded_spec, - check_valid_batch_dims, te_dtype_to_jax_dtype, jax_dtype_to_te_dtype, multidim_transpose, @@ -362,12 +361,19 @@ def batcher( use_rht, ): """Batch rule for quantization primitive using general batcher.""" - # check_valid_batch_dims(batch_dims) assert BaseDBiasQuantizePrimitive.outer_primitive is not None return BaseDBiasQuantizePrimitive.batcher_impl( batched_args, batch_dims, + output_bdims=( + batch_dims[0], # out + batch_dims[0], # colwise_out (probably need to transpose according if scaling mode does it) + 0, # scale_inv + 0, # colwise_scale_inv + 0, # updated_amax + 0, # dbias + ), static_kwargs={ "out_dtype": out_dtype, "scaling_mode": scaling_mode, diff --git a/transformer_engine/jax/sharding.py b/transformer_engine/jax/sharding.py index 01405ba87a..6cb0dd257c 100644 --- a/transformer_engine/jax/sharding.py +++ b/transformer_engine/jax/sharding.py @@ -261,8 +261,6 @@ def get_mesh_axis_size(axis, mesh=None): if axis is None: return 1 - print(mesh) - assert axis in mesh.shape, f"{axis} is not a axis of the given mesh {mesh.shape}" return mesh.shape[axis] From 59145cc2a7d4e4cb92addbd39c374541cbed5eb9 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 22 Dec 2025 10:21:19 +0100 Subject: [PATCH 20/98] fix Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 7 ++++--- .../common/gemm/cublaslt_grouped_gemm.cu | 10 +++++----- .../common/include/transformer_engine/gemm.h | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 3336dbc6d5..bdcfa68a4f 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -95,7 +95,8 @@ struct GroupedBuffers { size_t grouped_setup_workspace_size(const size_t num_tensors) { const size_t ptr_bytes = num_tensors * sizeof(void*); const size_t int_bytes = num_tensors * sizeof(int); - size_t size = 4 * ptr_bytes + 3 * int_bytes + 2 * ptr_bytes; + // Layout: 6 pointer arrays (A, B, C, D, alpha, beta) + 3 int arrays (M, N, K) + size_t size = 6 * ptr_bytes + 3 * int_bytes; const size_t alignment = 256; size = ((size + alignment - 1) / alignment) * alignment; return size; @@ -320,8 +321,8 @@ void run_grouped_gemm_case(const TestParams& params) { GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.2+, but compile-time cuBLAS version is " << CUBLAS_VERSION << "."; #else - if (getDeviceComputeCapability() < hopperComputeCapability) { - GTEST_SKIP() << "Grouped GEMM requires Hopper (SM90) or newer."; + if (getDeviceComputeCapability() < blackwellComputeCapability) { + GTEST_SKIP() << "Grouped GEMM requires Blackwell (SM100) or newer."; } const std::vector> shapes = make_shapes(params.shape_case); diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 3647a4c39e..40180fe760 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -503,10 +503,10 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT NVTE_API_CALL(nvte_grouped_gemm); using namespace transformer_engine; - // Grouped GEMM requires Hopper (SM90) or newer + // Grouped GEMM requires Blackwell (SM100) or newer const int current_device = cuda::current_device(); - NVTE_CHECK(cuda::sm_arch(current_device) >= 90, - "nvte_grouped_gemm requires Hopper (SM90) or newer architecture."); + NVTE_CHECK(cuda::sm_arch(current_device) >= 100, + "nvte_grouped_gemm requires Blackwell (SM100) or newer architecture."); // Convert to internal types const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A); @@ -589,8 +589,8 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, cudaStream_t stream, const int64_t *avg_m, const int64_t *avg_n, const int64_t *avg_k) { - NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.2+, but compile-time cuBLAS version is ", - CUBLAS_VERSION, ". Please upgrade to CUDA 13.2 or newer."); + NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.1+, but compile-time cuBLAS version is ", + CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer."); } #endif // CUBLAS_VERSION >= 130100 diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 0c8d601d50..168141224c 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -234,9 +234,9 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor /* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */ /*! \brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C * - * \note Requires cuBLAS 13.1+ (CUDA 13.1+) and Hopper (SM90) or newer GPU architecture. + * \note Requires cuBLAS 13.1+ (CUDA 13.1+) and Blackwell (SM100) or newer GPU architecture. * Will error at runtime if compiled with an older cuBLAS version or run on - * a pre-Hopper GPU. + * a pre-Blackwell GPU. * * Performs batched GEMM on a collection of matrices with potentially different shapes. * All tensors in the group must have compatible dimensions for matrix multiplication. @@ -266,7 +266,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * * Requirements: * - cuBLAS 13.2+ (CUDA 13.2+) - * - Hopper (SM90) or newer GPU architecture + * - Blackwell (SM100) or newer GPU architecture * - A, B, C (if provided), D must have the same num_tensors * - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i] * - Shape compatibility: if transa=false, transb=false: From 77b422ac8d6e33bb5d56651a2e956629c17a5db8 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 22 Dec 2025 10:47:19 +0100 Subject: [PATCH 21/98] Require Blackwell (SM100) and cuBLAS 13.1+ for grouped GEMM Signed-off-by: Pawel Gadzinski --- 3rdparty/cudnn-frontend | 2 +- tests/cpp/operator/test_grouped_gemm.cu | 4 ++-- transformer_engine/common/include/transformer_engine/gemm.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend index 0258951d4d..be6c079be8 160000 --- a/3rdparty/cudnn-frontend +++ b/3rdparty/cudnn-frontend @@ -1 +1 @@ -Subproject commit 0258951d4d512f4714eb1574496f4d57669b1b93 +Subproject commit be6c079be8aaffa0fc079fcf039887e637c289c7 diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index bdcfa68a4f..2514f11ab3 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -317,8 +317,8 @@ std::vector> make_shapes(ShapeCase scase) { } void run_grouped_gemm_case(const TestParams& params) { -#if CUBLAS_VERSION < 130200 - GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.2+, but compile-time cuBLAS version is " +#if CUBLAS_VERSION < 130100 + GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.1+, but compile-time cuBLAS version is " << CUBLAS_VERSION << "."; #else if (getDeviceComputeCapability() < blackwellComputeCapability) { diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 168141224c..f4c60ca3fe 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -265,7 +265,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * heuristics. If NULL, computed automatically from A's logical shape. * * Requirements: - * - cuBLAS 13.2+ (CUDA 13.2+) + * - cuBLAS 13.1+ (CUDA 13.1+) * - Blackwell (SM100) or newer GPU architecture * - A, B, C (if provided), D must have the same num_tensors * - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i] From 9c8158ee86a30699710c0dc1cb17c5d9b9aa4ced Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 22 Dec 2025 11:28:47 +0100 Subject: [PATCH 22/98] fix Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 2514f11ab3..ada6980858 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -482,7 +482,7 @@ void run_grouped_gemm_case(const TestParams& params) { atol, rtol); } -#endif // CUBLAS_VERSION >= 130200 +#endif // CUBLAS_VERSION >= 130100 } class GroupedGemmTest : public ::testing::TestWithParam {}; From b1e0893be9eb00495765f65c636b23eae698afc1 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Mon, 22 Dec 2025 11:22:11 -0800 Subject: [PATCH 23/98] fix --- transformer_engine/common/gemm/cublaslt_gemm.cu | 8 ++++---- transformer_engine/jax/dense.py | 13 ++++++------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 118bf19335..92d89b425f 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -154,8 +154,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla if (is_fp8_dtype(ret.Atype)) { // Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage - NVTE_CHECK(ret.lda % 16 == 0, - "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); + // NVTE_CHECK(ret.lda % 16 == 0, + // "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); } } else if (nvfp4) { // NVFP4 GEMM. Either the pure NVFP4 recipe or the FWD pass of the Hybrid NVFP4/MXFP8 recipe. @@ -245,8 +245,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla if (is_fp8_dtype(ret.Atype)) { // Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage - NVTE_CHECK(ret.ldb % 16 == 0, - "Leading dimension requirement on B for FP8 GEMM. Caller must pad."); + // NVTE_CHECK(ret.ldb % 16 == 0, + // "Leading dimension requirement on B for FP8 GEMM. Caller must pad."); } } else if (nvfp4) { if (is_B_transposed) { diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index 62b0e054aa..9db60d3bd8 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -244,28 +244,27 @@ def dot_general_transpose_lhs(g, x, y, *, dimension_numbers, import numpy as np def _remaining(original, *removed_lists): removed = set(itertools.chain(*removed_lists)) - return [i for i in original if i not in removed] + return tuple(i for i in original if i not in removed) def _ranges_like(*xs): start = 0 for x in xs: x_len = len(x) - yield range(start, start + x_len) + yield tuple(range(start, start + x_len)) start += x_len (x_contract, y_contract), (x_batch, y_batch) = dimension_numbers x_ndim = x.ndim - x_kept = _remaining(range(x_ndim), x_contract, x_batch) - y_kept = _remaining(range(y.ndim), y_contract, y_batch) + x_kept = _remaining(tuple(range(x_ndim)), x_contract, x_batch) + y_kept = _remaining(tuple(range(y.ndim)), y_contract, y_batch) if swap_ans: ans_batch, ans_y, _ = _ranges_like(x_batch, y_kept, x_kept) else: ans_batch, _, ans_y = _ranges_like(x_batch, x_kept, y_kept) dims = ((ans_y, y_kept), (ans_batch, y_batch)) - x_contract_sorted_by_y = list(np.take(x_contract, np.argsort(y_contract))) - out_axes = np.argsort(list(x_batch) + x_kept + x_contract_sorted_by_y) + x_contract_sorted_by_y = tuple(np.take(x_contract, np.argsort(y_contract))) + out_axes = np.argsort(tuple(x_batch) + x_kept + x_contract_sorted_by_y) x_bar = jax.lax.transpose( - # TODO(jberchtold): I'm ignoring the batch_dims here, do I need to explicitly use vmap or something? tex.gemm(g, y, contracting_dims=dims[0]), tuple(out_axes) ) From fb2067bacb9c21b71ff6cd329cae542415400887 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 23 Dec 2025 10:03:29 -0800 Subject: [PATCH 24/98] move einsum logic into TE --- transformer_engine/jax/flax/__init__.py | 3 +- transformer_engine/jax/flax/module.py | 62 +++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/transformer_engine/jax/flax/__init__.py b/transformer_engine/jax/flax/__init__.py index d1a9cb47f8..59a0958b7b 100644 --- a/transformer_engine/jax/flax/__init__.py +++ b/transformer_engine/jax/flax/__init__.py @@ -4,7 +4,7 @@ """Transformer Engine bindings for JAX""" from .module import DenseGeneral, LayerNorm from .module import LayerNormDenseGeneral, LayerNormMLP -from .module import wrap_function_in_te_state_module, make_dot_general_cls +from .module import wrap_function_in_te_state_module, make_dot_general_cls, make_einsum_cls from .transformer import extend_logical_axis_rules from .transformer import DotProductAttention, MultiHeadAttention, RelativePositionBiases from .transformer import TransformerLayer, TransformerLayerType @@ -16,6 +16,7 @@ "LayerNormMLP", "wrap_function_in_te_state_module", "make_dot_general_cls", + "make_einsum_cls", "extend_logical_axis_rules", "DotProductAttention", "MultiHeadAttention", diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index dcfb812896..ca84d46d6b 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1438,3 +1438,65 @@ def te_dot_general(generate_quantizer_set, x, kernel, dims, **kwargs): ) return wrap_function_in_te_state_module(te_dot_general, quantization_recipe, "dot_general") + +def make_einsum_cls(quantization_recipe): + import functools + import jax + def te_einsum(generate_quantizer_set, s, x, kernel, **kwargs): + quantizer_set = generate_quantizer_set() + def dot_general(x, kernel, dims, *args, **kwargs): + # print(f"TE dot_general called with dims: {dims}, args: {args}, kwargs: {kwargs}") + contracting_dims, batch_dims = dims + ((x_bdim,), (k_bdim,)) = batch_dims + batch_dims = (x_bdim, k_bdim) + + if x_bdim != 0 or k_bdim != 0: + print(f"{x_bdim=}, {k_bdim=}") + return jax.lax.dot_general(x, kernel, dims, *args, **kwargs) + + if x.dtype not in [jnp.float16, jnp.bfloat16, jnp.float32, jnp.float64]: + # HACK: because x input is bool for dispatch mask + x = x.astype(kernel.dtype) + + # Adjust for unbatched + contracting_dims = tuple( + tuple(dim - (1 if dim > bdim else 0) for dim in cdims) + for bdim, cdims in zip(batch_dims, contracting_dims)) + + f = functools.partial( + dense, + contracting_dims=contracting_dims, + quantizer_set=quantizer_set) + return jax.vmap(f, in_axes=(x_bdim, k_bdim))( + x, + kernel, + ) + + group_sizes = None + + # assuming x batch dim is axis 0, squash dims so we have (B*M, K) + # import math + # num_groups = x.shape[0] + # group_size = math.prod(x.shape[1:-1]) + # x_orig_ndim = x.ndim + # # FIXME: breaks partitioning + # x = x.reshape(x.shape[0] * group_size, x.shape[-1]) + # contracting_dims = ( + # tuple([c - (x_orig_ndim - x.ndim) for c in contracting_dims[0]]), + # *contracting_dims[1:], + # ) + + # group_sizes = jnp.array([group_size]*num_groups, dtype=jnp.int32) + + # print(f'{group_sizes=}, {contracting_dims=}, {x.shape=}, {kernel.shape=}, {contracting_dims=}') + + # return transformer_engine.jax.dense.grouped_dense( + # x, + # kernel, + # group_sizes=group_sizes, + # contracting_dims=contracting_dims, + # # quantizer_set=quantizer_set + # ) + return jnp.einsum(s, x, kernel, _dot_general=dot_general, **kwargs) + + return wrap_function_in_te_state_module(te_einsum, quantization_recipe, "einsum")() From 30716a622c2d1f381de0e09800ef9936b030c420 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 23 Dec 2025 10:42:36 -0800 Subject: [PATCH 25/98] einsum unit tests --- tests/jax/test_custom_call_compute.py | 41 +++++++++++++++++++++++++++ transformer_engine/jax/flax/module.py | 7 ++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 897d9f683e..7a81683bc7 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1974,3 +1974,44 @@ def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): assert_allclose(prim_dgrad, ref_dgrad, dtype=bwd_dtype) assert_allclose(prim_wgrad, ref_wgrad, dtype=bwd_dtype) assert_allclose(prim_dbias, ref_dbias, dtype=dtype) + +class TestEinsum: + + def _te_einsum(self, eqn, a, b, quantization_recipe): + from transformer_engine.jax.flax import make_einsum_cls + + te_einsum = make_einsum_cls(quantization_recipe=quantization_recipe) + var_collect = te_einsum.init(jax.random.PRNGKey(0), eqn, a, b) + return te_einsum.apply(var_collect, eqn, a, b) + + def _ref_einsum(self, eqn, a, b): + return jnp.einsum(eqn, a, b) + + @pytest_parametrize_wrapper('eqn,a_shape,b_shape', [ + # ('ij,jk->ik', (64, 32), (32, 128)), + # ('bij,bjk->bik', (8, 64, 32), (8, 32, 128)), + # ('abc,cde->abde', (4, 8, 16), (16, 32, 64)), + ('BSM,BSEC->EBCM', (2, 4096, 4096), (2, 4096, 8, 1024)), + ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)) , + ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)), + ('EBCH,EHM->EBCM', (8, 2, 1024, 14336), (8, 14336, 4096)), + ('EBCM,BSEC->BSM', (8, 2, 1024, 4096), (2, 4096, 8, 1024)), + ]) + @pytest_parametrize_wrapper('dtype', [jnp.bfloat16]) + @pytest_parametrize_wrapper('quantization_recipe', supported_recipes) + def test_einsum(self, eqn, a_shape, b_shape, dtype, quantization_recipe): + from transformer_engine.common.recipe import Float8CurrentScaling + import functools + + if not isinstance(quantization_recipe, Float8CurrentScaling): + pytest.skip("Einsum currently only supports Float8CurrentScaling recipe.") + return + key = jax.random.PRNGKey(0) + subkeys = jax.random.split(key, 2) + a = jax.random.uniform(subkeys[0], a_shape, dtype=dtype) + b = jax.random.uniform(subkeys[1], b_shape, dtype=dtype) + + te_out = jax.jit(functools.partial(self._te_einsum, eqn, quantization_recipe=quantization_recipe))(a, b) + ref_out = jax.jit(functools.partial(self._ref_einsum, eqn))(a, b) + + assert_allclose(te_out, ref_out, dtype=dtype) \ No newline at end of file diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index ca84d46d6b..0399ccfabf 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1443,7 +1443,8 @@ def make_einsum_cls(quantization_recipe): import functools import jax def te_einsum(generate_quantizer_set, s, x, kernel, **kwargs): - quantizer_set = generate_quantizer_set() + # with open("/tmp/te_einsum_log.txt", "a") as f: + # f.write(f"{(s, x.shape, kernel.shape)}\n") def dot_general(x, kernel, dims, *args, **kwargs): # print(f"TE dot_general called with dims: {dims}, args: {args}, kwargs: {kwargs}") contracting_dims, batch_dims = dims @@ -1453,6 +1454,10 @@ def dot_general(x, kernel, dims, *args, **kwargs): if x_bdim != 0 or k_bdim != 0: print(f"{x_bdim=}, {k_bdim=}") return jax.lax.dot_general(x, kernel, dims, *args, **kwargs) + + quantizer_set = generate_quantizer_set() + print(f'{quantizer_set=}') + # import pdb; pdb.set_trace() if x.dtype not in [jnp.float16, jnp.bfloat16, jnp.float32, jnp.float64]: # HACK: because x input is bool for dispatch mask From 349c3155fdd34b1fc1ca009252ac64105fc6c24e Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 23 Dec 2025 10:47:19 -0800 Subject: [PATCH 26/98] fwd bwd einsum test --- tests/jax/test_custom_call_compute.py | 56 ++++++++++++++++++++------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 7a81683bc7..082a99cd8b 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1975,6 +1975,18 @@ def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): assert_allclose(prim_wgrad, ref_wgrad, dtype=bwd_dtype) assert_allclose(prim_dbias, ref_dbias, dtype=dtype) +@pytest_parametrize_wrapper('eqn,a_shape,b_shape', [ + # ('ij,jk->ik', (64, 32), (32, 128)), + # ('bij,bjk->bik', (8, 64, 32), (8, 32, 128)), + # ('abc,cde->abde', (4, 8, 16), (16, 32, 64)), + ('BSM,BSEC->EBCM', (2, 4096, 4096), (2, 4096, 8, 1024)), + ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)) , + ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)), + ('EBCH,EHM->EBCM', (8, 2, 1024, 14336), (8, 14336, 4096)), + ('EBCM,BSEC->BSM', (8, 2, 1024, 4096), (2, 4096, 8, 1024)), +]) +@pytest_parametrize_wrapper('dtype', [jnp.bfloat16]) +@pytest_parametrize_wrapper('quantization_recipe', supported_recipes) class TestEinsum: def _te_einsum(self, eqn, a, b, quantization_recipe): @@ -1987,19 +1999,7 @@ def _te_einsum(self, eqn, a, b, quantization_recipe): def _ref_einsum(self, eqn, a, b): return jnp.einsum(eqn, a, b) - @pytest_parametrize_wrapper('eqn,a_shape,b_shape', [ - # ('ij,jk->ik', (64, 32), (32, 128)), - # ('bij,bjk->bik', (8, 64, 32), (8, 32, 128)), - # ('abc,cde->abde', (4, 8, 16), (16, 32, 64)), - ('BSM,BSEC->EBCM', (2, 4096, 4096), (2, 4096, 8, 1024)), - ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)) , - ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)), - ('EBCH,EHM->EBCM', (8, 2, 1024, 14336), (8, 14336, 4096)), - ('EBCM,BSEC->BSM', (8, 2, 1024, 4096), (2, 4096, 8, 1024)), - ]) - @pytest_parametrize_wrapper('dtype', [jnp.bfloat16]) - @pytest_parametrize_wrapper('quantization_recipe', supported_recipes) - def test_einsum(self, eqn, a_shape, b_shape, dtype, quantization_recipe): + def test_einsum_fwd(self, eqn, a_shape, b_shape, dtype, quantization_recipe): from transformer_engine.common.recipe import Float8CurrentScaling import functools @@ -2014,4 +2014,32 @@ def test_einsum(self, eqn, a_shape, b_shape, dtype, quantization_recipe): te_out = jax.jit(functools.partial(self._te_einsum, eqn, quantization_recipe=quantization_recipe))(a, b) ref_out = jax.jit(functools.partial(self._ref_einsum, eqn))(a, b) - assert_allclose(te_out, ref_out, dtype=dtype) \ No newline at end of file + assert_allclose(te_out, ref_out, dtype=dtype) + + def test_einsum_fwd_and_bwd(self, eqn, a_shape, b_shape, dtype, quantization_recipe): + from transformer_engine.common.recipe import Float8CurrentScaling + import functools + + if not isinstance(quantization_recipe, Float8CurrentScaling): + pytest.skip("Einsum currently only supports Float8CurrentScaling recipe.") + return + key = jax.random.PRNGKey(0) + subkeys = jax.random.split(key, 2) + a = jax.random.uniform(subkeys[0], a_shape, dtype=dtype) + b = jax.random.uniform(subkeys[1], b_shape, dtype=dtype) + + def wrap_in_mean(f): + @functools.wraps(f) + def wrapped(*args): + return jnp.mean(f(*args)) + return wrapped + + te_fwd, te_grads = jax.jit(jax.value_and_grad(wrap_in_mean(functools.partial(self._te_einsum, eqn, quantization_recipe=quantization_recipe))))(a, b) + ref_fwd, ref_grads = jax.jit(jax.value_and_grad(wrap_in_mean(functools.partial(self._ref_einsum, eqn))))(a, b) + + assert_allclose(te_fwd, ref_fwd, dtype=dtype) + + assert len(te_grads) == len(ref_grads), f"Number of gradients differ: {len(te_grads)=} vs {len(ref_grads)=}" + + for te_grad, ref_grad in zip(te_grads, ref_grads): + assert_allclose(te_grad, ref_grad, dtype=dtype) \ No newline at end of file From 57ab3b09c9baf1587aaca4ecb5632b91021e1c14 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 23 Dec 2025 11:12:59 -0800 Subject: [PATCH 27/98] unit tests passed with grouped gemm in bf16 --- transformer_engine/jax/flax/module.py | 78 +++++++++++++++------------ 1 file changed, 44 insertions(+), 34 deletions(-) diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 0399ccfabf..733eaf513b 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -17,7 +17,7 @@ from jax.ad_checkpoint import checkpoint_name -from ..dense import dense +from ..dense import dense, grouped_dense from ..layernorm import canonicalize_norm_type from ..layernorm import layernorm @@ -1455,9 +1455,9 @@ def dot_general(x, kernel, dims, *args, **kwargs): print(f"{x_bdim=}, {k_bdim=}") return jax.lax.dot_general(x, kernel, dims, *args, **kwargs) + target_out_shape = jax.lax.dot_general(x, kernel, dims).shape + # TODO: add num groups to make grouped quantizer set quantizer_set = generate_quantizer_set() - print(f'{quantizer_set=}') - # import pdb; pdb.set_trace() if x.dtype not in [jnp.float16, jnp.bfloat16, jnp.float32, jnp.float64]: # HACK: because x input is bool for dispatch mask @@ -1468,40 +1468,50 @@ def dot_general(x, kernel, dims, *args, **kwargs): tuple(dim - (1 if dim > bdim else 0) for dim in cdims) for bdim, cdims in zip(batch_dims, contracting_dims)) - f = functools.partial( - dense, - contracting_dims=contracting_dims, - quantizer_set=quantizer_set) - return jax.vmap(f, in_axes=(x_bdim, k_bdim))( + group_sizes = None + print(f'{x.shape=}, {kernel.shape=}, {dims=}') + + def reorder_lhs_for_grouped_gemm(tensor, cdims): + # (B*M, K) + assert len(cdims) == 1, f"Only support single contracting dim for now, got {cdims}" + cdim = cdims[0] + 1 # account for batch dim at front + out = jnp.transpose(tensor, tuple(range(cdim)) + tuple(range(cdim + 1, tensor.ndim)) + (cdim,)) + return out.reshape((-1, out.shape[-1])) + + + def reorder_rhs_for_grouped_gemm(tensor, bdims, cdims): + # (B, K, N) + assert len(bdims) == 1 and len(cdims) == 1, f"Only support single batch and contracting dim for now, got {bdims}, {cdims}" + bdim = bdims[0] + assert bdim == 0, f"Only support batch dim 0 for now, got {bdim}" + cdim = cdims[0] + 1 # account for batch dim at front + out = jnp.transpose(tensor, (bdim, cdim) + tuple(i for i in range(tensor.ndim) if i != bdim and i != cdim)) + return out.reshape((*out.shape[:2], -1)) + + x = reorder_lhs_for_grouped_gemm(x, contracting_dims[0]) + kernel = reorder_rhs_for_grouped_gemm(kernel, (batch_dims[1],), contracting_dims[1]) + + num_groups = kernel.shape[0] + group_size = x.shape[0] // num_groups + + group_sizes = jnp.array([group_size]*num_groups, dtype=jnp.int32) + + print(f'{group_sizes=}, {contracting_dims=}, {x.shape=}, {kernel.shape=}, {contracting_dims=}') + + contracting_dims = ( + # (B*M, K) + (1,), + # (B, K, N) + (1,), + ) + out = grouped_dense( x, kernel, + group_sizes=group_sizes, + contracting_dims=contracting_dims, + # quantizer_set=quantizer_set ) - - group_sizes = None - - # assuming x batch dim is axis 0, squash dims so we have (B*M, K) - # import math - # num_groups = x.shape[0] - # group_size = math.prod(x.shape[1:-1]) - # x_orig_ndim = x.ndim - # # FIXME: breaks partitioning - # x = x.reshape(x.shape[0] * group_size, x.shape[-1]) - # contracting_dims = ( - # tuple([c - (x_orig_ndim - x.ndim) for c in contracting_dims[0]]), - # *contracting_dims[1:], - # ) - - # group_sizes = jnp.array([group_size]*num_groups, dtype=jnp.int32) - - # print(f'{group_sizes=}, {contracting_dims=}, {x.shape=}, {kernel.shape=}, {contracting_dims=}') - - # return transformer_engine.jax.dense.grouped_dense( - # x, - # kernel, - # group_sizes=group_sizes, - # contracting_dims=contracting_dims, - # # quantizer_set=quantizer_set - # ) + return out.reshape(target_out_shape) return jnp.einsum(s, x, kernel, _dot_general=dot_general, **kwargs) return wrap_function_in_te_state_module(te_einsum, quantization_recipe, "einsum")() From ab98852671870d1ebabeaf22eb65609d536ca744 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 23 Dec 2025 11:26:56 -0800 Subject: [PATCH 28/98] grouped quantization working for single gpu --- transformer_engine/jax/flax/module.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 733eaf513b..cc6088e8d2 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -377,6 +377,7 @@ def generate_quantizer_set( variable_collection: str = None, quantization_checkpoint_name: Optional[str] = None, fp8_recipe=None, + n_groups: int = None, ): """ Generate a set of FP8 meta for a GEMM. @@ -409,6 +410,7 @@ def generate_quantizer_set( fp8_recipe=fp8_recipe, quantize_meta_set=quantize_meta_set, checkpoint_name=quantization_checkpoint_name, + n_groups=n_groups, ) return quantizer_set @@ -1379,12 +1381,13 @@ def wrap_function_in_te_state_module(f, quantization_recipe, name: Optional[str] class TEWrapper(te.flax.module.TransformerEngineBase): """Wrapper Flax module for TransformerEngine quantization support.""" - def generate_quantizer_set(self, postfix: str = ""): + def generate_quantizer_set(self, postfix: str = "", n_groups: int = None): OVERWRITE_WITH_GRADIENT = "_overwrite_with_gradient" return super().generate_quantizer_set( postfix=postfix, variable_collection=OVERWRITE_WITH_GRADIENT, fp8_recipe=quantization_recipe, + n_groups=n_groups, ) @nn.compact @@ -1456,8 +1459,6 @@ def dot_general(x, kernel, dims, *args, **kwargs): return jax.lax.dot_general(x, kernel, dims, *args, **kwargs) target_out_shape = jax.lax.dot_general(x, kernel, dims).shape - # TODO: add num groups to make grouped quantizer set - quantizer_set = generate_quantizer_set() if x.dtype not in [jnp.float16, jnp.bfloat16, jnp.float32, jnp.float64]: # HACK: because x input is bool for dispatch mask @@ -1496,6 +1497,8 @@ def reorder_rhs_for_grouped_gemm(tensor, bdims, cdims): group_sizes = jnp.array([group_size]*num_groups, dtype=jnp.int32) + quantizer_set = generate_quantizer_set(n_groups=num_groups) + print(f'{group_sizes=}, {contracting_dims=}, {x.shape=}, {kernel.shape=}, {contracting_dims=}') contracting_dims = ( @@ -1509,7 +1512,7 @@ def reorder_rhs_for_grouped_gemm(tensor, bdims, cdims): kernel, group_sizes=group_sizes, contracting_dims=contracting_dims, - # quantizer_set=quantizer_set + quantizer_set=quantizer_set ) return out.reshape(target_out_shape) return jnp.einsum(s, x, kernel, _dot_general=dot_general, **kwargs) From ed540c8e5566d46f2ddb645fdd3940ff94d310c3 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Tue, 30 Dec 2025 09:19:40 +0000 Subject: [PATCH 29/98] fixes Signed-off-by: Pawel Gadzinski --- 3rdparty/cudnn-frontend | 2 +- tests/cpp/operator/test_grouped_gemm.cu | 6 +- transformer_engine/common/gemm/config.cpp | 86 +++++++++++ transformer_engine/common/gemm/config.h | 19 +++ .../common/gemm/cublaslt_gemm.cu | 1 - .../common/gemm/cublaslt_grouped_gemm.cu | 23 +-- .../common/include/transformer_engine/gemm.h | 134 ++++++++++++++++-- 7 files changed, 245 insertions(+), 26 deletions(-) diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend index be6c079be8..0258951d4d 160000 --- a/3rdparty/cudnn-frontend +++ b/3rdparty/cudnn-frontend @@ -1 +1 @@ -Subproject commit be6c079be8aaffa0fc079fcf039887e637c289c7 +Subproject commit 0258951d4d512f4714eb1574496f4d57669b1b93 diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index ada6980858..4d6e1b7bb9 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -457,10 +457,8 @@ void run_grouped_gemm_case(const TestParams& params) { grouped_D.get_handle(), setup_ws.data(), cublas_ws.data(), - 0, - nullptr, - nullptr, - nullptr); + nullptr, // config (use defaults) + 0); for (size_t i = 0; i < num_gemms; ++i) { Tensor grouped_split("grouped_D" + std::to_string(i), diff --git a/transformer_engine/common/gemm/config.cpp b/transformer_engine/common/gemm/config.cpp index cf211beaf9..bf0b7bc2bf 100644 --- a/transformer_engine/common/gemm/config.cpp +++ b/transformer_engine/common/gemm/config.cpp @@ -114,3 +114,89 @@ void nvte_destroy_matmul_config(NVTEMatmulConfig config) { delete reinterpret_cast(config); } } + +NVTEGroupedMatmulConfig nvte_create_grouped_matmul_config() { + return new transformer_engine::GroupedMatmulConfig; +} + +void nvte_get_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, + NVTEGroupedMatmulConfigAttribute attr, void *buf, + size_t size_in_bytes, size_t *size_written) { + // Write attribute size + NVTE_CHECK(attr < kNVTEGroupedMatmulConfigNumAttributes, + "Invalid NVTEGroupedMatmulConfigAttribute (got ", static_cast(attr), ")"); + NVTE_CHECK(size_written != nullptr, "Invalid size_written (got NULL)"); + const auto &attr_size = transformer_engine::GroupedMatmulConfig::attr_sizes[attr]; + *size_written = attr_size; + + // Return immediately if buffer is not provided + if (buf == nullptr) { + return; + } + + // Check buffer size + NVTE_CHECK(size_in_bytes >= attr_size, + "Buffer is too small for grouped matmul config attribute " + "(attribute ", + static_cast(attr), " needs ", attr_size, " bytes, but buffer has ", size_in_bytes, + " bytes)"); + + // Write to buffer + NVTE_CHECK(config != nullptr, "Invalid NVTEGroupedMatmulConfig (got NULL)"); + const auto &config_ = + *reinterpret_cast(config); + switch (attr) { + case kNVTEGroupedMatmulConfigAvgM: + std::memcpy(buf, &config_.avg_m, attr_size); + break; + case kNVTEGroupedMatmulConfigAvgN: + std::memcpy(buf, &config_.avg_n, attr_size); + break; + case kNVTEGroupedMatmulConfigAvgK: + std::memcpy(buf, &config_.avg_k, attr_size); + break; + default: + NVTE_ERROR("Unsupported NVTEGroupedMatmulConfigAttribute (got ", static_cast(attr), ")"); + } +} + +void nvte_set_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, + NVTEGroupedMatmulConfigAttribute attr, + const void *buf, size_t size_in_bytes) { + // Check attribute and buffer + NVTE_CHECK(attr < kNVTEGroupedMatmulConfigNumAttributes, + "Invalid NVTEGroupedMatmulConfigAttribute (got ", static_cast(attr), ")"); + const auto &attr_size = transformer_engine::GroupedMatmulConfig::attr_sizes[attr]; + NVTE_CHECK(size_in_bytes >= attr_size, + "Buffer is too small for grouped matmul config attribute " + "(attribute ", + static_cast(attr), " needs ", attr_size, " bytes, but buffer has ", size_in_bytes, + " bytes)"); + NVTE_CHECK(buf != nullptr, "Invalid buffer (got NULL)"); + + // Read from buffer + NVTE_CHECK(config != nullptr, "Invalid NVTEGroupedMatmulConfig (got NULL)"); + auto &config_ = *reinterpret_cast(config); + switch (attr) { + case kNVTEGroupedMatmulConfigAvgM: + std::memcpy(&config_.avg_m, buf, attr_size); + config_.avg_m_set = true; + break; + case kNVTEGroupedMatmulConfigAvgN: + std::memcpy(&config_.avg_n, buf, attr_size); + config_.avg_n_set = true; + break; + case kNVTEGroupedMatmulConfigAvgK: + std::memcpy(&config_.avg_k, buf, attr_size); + config_.avg_k_set = true; + break; + default: + NVTE_ERROR("Unsupported NVTEGroupedMatmulConfigAttribute (got ", static_cast(attr), ")"); + } +} + +void nvte_destroy_grouped_matmul_config(NVTEGroupedMatmulConfig config) { + if (config != nullptr) { + delete reinterpret_cast(config); + } +} diff --git a/transformer_engine/common/gemm/config.h b/transformer_engine/common/gemm/config.h index 54ccf06a53..4f93ff7fbc 100644 --- a/transformer_engine/common/gemm/config.h +++ b/transformer_engine/common/gemm/config.h @@ -31,6 +31,25 @@ struct MatmulConfig { }; }; +struct GroupedMatmulConfig { + // Average dimension hints for cuBLASLt algorithm selection heuristics. + // Value of 0 means "not set" - compute automatically from tensor shapes. + int64_t avg_m = 0; + int64_t avg_n = 0; + int64_t avg_k = 0; + + // Track which attributes have been explicitly set + bool avg_m_set = false; + bool avg_n_set = false; + bool avg_k_set = false; + + static constexpr size_t attr_sizes[] = { + sizeof(int64_t), // avg_m + sizeof(int64_t), // avg_n + sizeof(int64_t) // avg_k + }; +}; + } // namespace transformer_engine #endif // TRANSFORMER_ENGINE_GEMM_CONFIG_H_ diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 1d931da4aa..118bf19335 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -23,7 +23,6 @@ #include "../util/logging.h" #include "../util/multi_stream.h" #include "./config.h" -#include "./cublaslt_grouped_gemm.cuh" #include "./cutlass_grouped_gemm.cuh" namespace { diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 40180fe760..03692bf052 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -16,6 +16,7 @@ #include "../util/cuda_runtime.h" #include "../util/handle_manager.h" #include "../util/logging.h" +#include "./config.h" #include "./cublaslt_grouped_gemm.cuh" namespace { @@ -498,8 +499,7 @@ inline size_t grouped_gemm_setup_workspace_size(size_t num_tensors) { void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, - cudaStream_t stream, const int64_t *avg_m, const int64_t *avg_n, - const int64_t *avg_k) { + NVTEGroupedMatmulConfig config, cudaStream_t stream) { NVTE_API_CALL(nvte_grouped_gemm); using namespace transformer_engine; @@ -518,6 +518,12 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT Tensor *wspace_setup = convertNVTETensor(workspace_setup); Tensor *wspace_cublas = convertNVTETensor(workspace_cublas); + // Parse config (if provided) + GroupedMatmulConfig config_; + if (config != nullptr) { + config_ = *reinterpret_cast(config); + } + // Validate inputs and num_tensors validate_grouped_gemm_inputs(inputA, inputB, inputC_raw, outputD, alpha_tensor, beta_tensor); @@ -564,11 +570,11 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT // Compute average dimensions for heuristics // K dimension: if transa, K is A's first dim; if not, K is A's last dim - int64_t avg_m_val = avg_m ? *avg_m : compute_avg_first_dim(outputD); - int64_t avg_n_val = avg_n ? *avg_n : compute_avg_last_dim(outputD); - int64_t avg_k_val = avg_k ? *avg_k - : (A_sel.trans ? compute_avg_first_dim(A_sel.tensor) - : compute_avg_last_dim(A_sel.tensor)); + int64_t avg_m_val = config_.avg_m_set ? config_.avg_m : compute_avg_first_dim(outputD); + int64_t avg_n_val = config_.avg_n_set ? config_.avg_n : compute_avg_last_dim(outputD); + int64_t avg_k_val = config_.avg_k_set ? config_.avg_k + : (A_sel.trans ? compute_avg_first_dim(A_sel.tensor) + : compute_avg_last_dim(A_sel.tensor)); // Heuristic selection cublasLtMatmulAlgo_t algo = select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, @@ -587,8 +593,7 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, - cudaStream_t stream, const int64_t *avg_m, const int64_t *avg_n, - const int64_t *avg_k) { + NVTEGroupedMatmulConfig config, cudaStream_t stream) { NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.1+, but compile-time cuBLAS version is ", CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer."); } diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index f4c60ca3fe..00fd0b7048 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -22,6 +22,9 @@ extern "C" { /*! \brief Configuration for matrix multiplication. */ typedef void *NVTEMatmulConfig; +/*! \brief Configuration for grouped matrix multiplication. */ +typedef void *NVTEGroupedMatmulConfig; + /*! \enum NVTEMatmulConfigAttribute * \brief Type of option for matrix multiplication. */ @@ -54,6 +57,34 @@ enum NVTEMatmulConfigAttribute { kNVTEMatmulConfigNumAttributes }; +/*! \enum NVTEGroupedMatmulConfigAttribute + * \brief Type of option for grouped matrix multiplication. + */ +enum NVTEGroupedMatmulConfigAttribute { + /*! Average M dimension hint + * + * Optional hint for average M dimension across all matrices in the group. + * Used by cuBLASLt for algorithm selection heuristics. If not set, + * computed automatically from D's logical shape. + */ + kNVTEGroupedMatmulConfigAvgM = 0, + /*! Average N dimension hint + * + * Optional hint for average N dimension across all matrices in the group. + * Used by cuBLASLt for algorithm selection heuristics. If not set, + * computed automatically from D's logical shape. + */ + kNVTEGroupedMatmulConfigAvgN = 1, + /*! Average K (reduction) dimension hint + * + * Optional hint for average K dimension across all matrices in the group. + * Used by cuBLASLt for algorithm selection heuristics. If not set, + * computed automatically from A's logical shape. + */ + kNVTEGroupedMatmulConfigAvgK = 2, + kNVTEGroupedMatmulConfigNumAttributes +}; + /*! \brief Create a matrix multiplication configuration. */ NVTEMatmulConfig nvte_create_matmul_config(); @@ -84,6 +115,38 @@ void nvte_set_matmul_config_attribute(NVTEMatmulConfig config, NVTEMatmulConfigA /*! \brief Destroy a matrix multiplication configuration. */ void nvte_destroy_matmul_config(NVTEMatmulConfig config); +/*! \brief Create a grouped matrix multiplication configuration. */ +NVTEGroupedMatmulConfig nvte_create_grouped_matmul_config(); + +/*! \brief Query an option in grouped matrix multiplication configuration. + * + * \param[in] config Grouped matrix multiplication configuration. + * \param[in] attr Option type. + * \param[out] buf Memory address to write option value. Ignored if + * NULL. + * \param[in] size_in_bytes Size of buf. + * \param[out] size_written Number of bytes that have been written to + * buf. If buf is NULL, then the number of + * bytes that would have been written. + */ +void nvte_get_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, + NVTEGroupedMatmulConfigAttribute attr, void *buf, + size_t size_in_bytes, size_t *size_written); + +/*! \brief Set an option in grouped matrix multiplication configuration. + * + * \param[in] config Grouped matrix multiplication configuration. + * \param[in] attr Option type. + * \param[out] buf Memory address to read option value. + * \param[in] size_in_bytes Size of buf. + */ +void nvte_set_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, + NVTEGroupedMatmulConfigAttribute attr, + const void *buf, size_t size_in_bytes); + +/*! \brief Destroy a grouped matrix multiplication configuration. */ +void nvte_destroy_grouped_matmul_config(NVTEGroupedMatmulConfig config); + /*! \brief Compute matrix multiplication of 2 matrices, potentially fused with other operations (deprecated). * * This has been deprecated in favor of nvte_cublas_gemm_v2. @@ -253,16 +316,8 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * \param[out] D Output grouped tensor D. * \param[in] workspace_setup Workspace tensor for pointer array setup. * \param[in] workspace_cublas Workspace tensor for cuBLAS operations. + * \param[in] config Additional configuration (can be NULL for defaults). * \param[in] stream CUDA stream for the operation. - * \param[in] avg_m Optional hint for average M dimension across all matrices in the - * group. Used by cuBLASLt for algorithm selection heuristics. - * If NULL, computed automatically from D's logical shape. - * \param[in] avg_n Optional hint for average N dimension across all matrices in the - * group. Used by cuBLASLt for algorithm selection heuristics. - * If NULL, computed automatically from D's logical shape. - * \param[in] avg_k Optional hint for average K (reduction) dimension across all - * matrices in the group. Used by cuBLASLt for algorithm selection - * heuristics. If NULL, computed automatically from A's logical shape. * * Requirements: * - cuBLAS 13.1+ (CUDA 13.1+) @@ -275,8 +330,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, - cudaStream_t stream, const int64_t *avg_m, const int64_t *avg_n, - const int64_t *avg_k); + NVTEGroupedMatmulConfig config, cudaStream_t stream); #ifdef __cplusplus } // extern "C" @@ -376,6 +430,64 @@ class MatmulConfigWrapper { NVTEMatmulConfig config_ = nullptr; }; +/*! \struct GroupedMatmulConfigWrapper + * \brief C++ wrapper for NVTEGroupedMatmulConfig. + */ +class GroupedMatmulConfigWrapper { + public: + GroupedMatmulConfigWrapper() : config_{nvte_create_grouped_matmul_config()} {} + + GroupedMatmulConfigWrapper(const GroupedMatmulConfigWrapper &) = delete; + GroupedMatmulConfigWrapper &operator=(const GroupedMatmulConfigWrapper &) = delete; + + GroupedMatmulConfigWrapper(GroupedMatmulConfigWrapper &&other) : config_{other.config_} { + other.config_ = nullptr; + } + GroupedMatmulConfigWrapper &operator=(GroupedMatmulConfigWrapper &&other) { + if (config_ != nullptr) { + nvte_destroy_grouped_matmul_config(config_); + } + config_ = other.config_; + other.config_ = nullptr; + return *this; + } + + ~GroupedMatmulConfigWrapper() { + if (config_ != nullptr) { + nvte_destroy_grouped_matmul_config(config_); + config_ = nullptr; + } + } + + /*! \brief Get the underlying NVTEGroupedMatmulConfig. + * + * \return NVTEGroupedMatmulConfig held by this GroupedMatmulConfigWrapper. + */ + operator NVTEGroupedMatmulConfig() const noexcept { return config_; } + + /*! \brief Set average M dimension hint for algorithm selection. */ + void set_avg_m(int64_t avg_m) { + nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigAvgM, &avg_m, + sizeof(int64_t)); + } + + /*! \brief Set average N dimension hint for algorithm selection. */ + void set_avg_n(int64_t avg_n) { + nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigAvgN, &avg_n, + sizeof(int64_t)); + } + + /*! \brief Set average K dimension hint for algorithm selection. */ + void set_avg_k(int64_t avg_k) { + nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigAvgK, &avg_k, + sizeof(int64_t)); + } + + private: + /*! \brief Wrapped NVTEGroupedMatmulConfig. */ + NVTEGroupedMatmulConfig config_ = nullptr; +}; + } // namespace transformer_engine #endif // __cplusplus From 359a9f548fcc8d7089f7cab9af824976f4aac120 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Dec 2025 09:31:37 +0000 Subject: [PATCH 30/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/gemm/config.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transformer_engine/common/gemm/config.cpp b/transformer_engine/common/gemm/config.cpp index bf0b7bc2bf..5c1a899d59 100644 --- a/transformer_engine/common/gemm/config.cpp +++ b/transformer_engine/common/gemm/config.cpp @@ -143,8 +143,7 @@ void nvte_get_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, // Write to buffer NVTE_CHECK(config != nullptr, "Invalid NVTEGroupedMatmulConfig (got NULL)"); - const auto &config_ = - *reinterpret_cast(config); + const auto &config_ = *reinterpret_cast(config); switch (attr) { case kNVTEGroupedMatmulConfigAvgM: std::memcpy(buf, &config_.avg_m, attr_size); From a702426f1bddc0b4b1e2d0ce5dd808a19c039174 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Tue, 30 Dec 2025 12:08:30 +0100 Subject: [PATCH 31/98] fixes Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 50 ++++++++++++------- transformer_engine/common/gemm/config.cpp | 12 +++++ transformer_engine/common/gemm/config.h | 12 ++++- .../common/gemm/cublaslt_grouped_gemm.cu | 9 ++++ .../common/include/transformer_engine/gemm.h | 16 ++++++ 5 files changed, 81 insertions(+), 18 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 4d6e1b7bb9..1a85e54f82 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -293,7 +293,8 @@ struct TestParams { bool transa; bool transb; ShapeCase shape_case; - bool use_null_c = false; // When true, pass nullptr for C (valid when beta=0) + bool use_null_c = false; // When true, pass nullptr for C (valid when beta=0) + bool use_split_accumulator = false; // Whether to use split accumulator for FP8 GEMM }; // Returns a vector of (M, N, K) tuples for each GEMM in the group. @@ -362,8 +363,6 @@ void run_grouped_gemm_case(const TestParams& params) { std::vector A_ptrs(num_gemms); std::vector B_ptrs(num_gemms); std::vector D_ptrs(num_gemms); - std::vector bias_ptrs(num_gemms, nullptr); - std::vector gelu_ptrs(num_gemms, nullptr); std::vector workspaces(num_gemms); std::vector workspace_ptrs(num_gemms, nullptr); std::vector A_views; @@ -371,6 +370,10 @@ void run_grouped_gemm_case(const TestParams& params) { A_views.reserve(num_gemms); B_views.reserve(num_gemms); + // Empty bias/gelu arrays for nvte_multi_tensor_gemm (no epilogues) + std::vector bias_ptrs(num_gemms, nullptr); + std::vector gelu_ptrs(num_gemms, nullptr); + const size_t cublas_ws_bytes = 32ull * 1024 * 1024; for (size_t i = 0; i < num_gemms; ++i) { @@ -391,11 +394,11 @@ void run_grouped_gemm_case(const TestParams& params) { static_cast(num_gemms), params.transa, params.transb, - false, + false, // grad workspace_ptrs.data(), - false, - false, - 0, + false, // accumulate + params.use_split_accumulator, + 0, // sm_count 0); GroupedBuffers grouped_A = build_grouped_tensor(A_views, A_tensors[0].scaling_mode()); @@ -447,6 +450,10 @@ void run_grouped_gemm_case(const TestParams& params) { Tensor setup_ws("setup_ws", std::vector{setup_ws_bytes}, DType::kByte); Tensor cublas_ws("cublas_ws", std::vector{cublas_ws_bytes}, DType::kByte); + // Create config with use_split_accumulator setting + transformer_engine::GroupedMatmulConfigWrapper config; + config.set_use_split_accumulator(params.use_split_accumulator); + nvte_grouped_gemm(params.transa, params.transb, alpha_tensor.data(), @@ -457,7 +464,7 @@ void run_grouped_gemm_case(const TestParams& params) { grouped_D.get_handle(), setup_ws.data(), cublas_ws.data(), - nullptr, // config (use defaults) + config, 0); for (size_t i = 0; i < num_gemms; ++i) { @@ -495,20 +502,29 @@ std::string MakeGroupedGemmTestName(const testing::TestParamInfo(info.param.input_case)]) + "_" + - kShapeNames[static_cast(info.param.shape_case)] + "_" + layout + null_c; + kShapeNames[static_cast(info.param.shape_case)] + "_" + layout + null_c + split_acc; } +// TestParams: {input_case, transa, transb, shape_case, use_null_c, use_split_accumulator} const std::vector kTestParams = { - {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false}, - {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false}, - {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false}, - {InputCase::kBF16, true, false, ShapeCase::kSameFirst, false}, - {InputCase::kBF16, false, true, ShapeCase::kSameLast, false}, - {InputCase::kBF16, false, false, ShapeCase::kAllSame, false}, - {InputCase::kBF16, true, true, ShapeCase::kAllDifferent, false}, + // Basic tests (no split accumulator) + {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false, false}, + {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false, false}, + {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false, false}, + {InputCase::kBF16, true, false, ShapeCase::kSameFirst, false, false}, + {InputCase::kBF16, false, true, ShapeCase::kSameLast, false, false}, + {InputCase::kBF16, false, false, ShapeCase::kAllSame, false, false}, + {InputCase::kBF16, true, true, ShapeCase::kAllDifferent, false, false}, // Test NULL C (valid when beta=0) - {InputCase::kBF16, false, false, ShapeCase::kAllSame, true}, + {InputCase::kBF16, false, false, ShapeCase::kAllSame, true, false}, + + // Split accumulator tests + {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false, true}, + {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false, true}, + {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false, true}, + {InputCase::kFP8Current, true, false, ShapeCase::kSameFirst, false, true}, }; INSTANTIATE_TEST_SUITE_P(OperatorTest, diff --git a/transformer_engine/common/gemm/config.cpp b/transformer_engine/common/gemm/config.cpp index 5c1a899d59..2c7fc38129 100644 --- a/transformer_engine/common/gemm/config.cpp +++ b/transformer_engine/common/gemm/config.cpp @@ -154,6 +154,12 @@ void nvte_get_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, case kNVTEGroupedMatmulConfigAvgK: std::memcpy(buf, &config_.avg_k, attr_size); break; + case kNVTEGroupedMatmulConfigUseSplitAccumulator: + std::memcpy(buf, &config_.use_split_accumulator, attr_size); + break; + case kNVTEGroupedMatmulConfigSMCount: + std::memcpy(buf, &config_.sm_count, attr_size); + break; default: NVTE_ERROR("Unsupported NVTEGroupedMatmulConfigAttribute (got ", static_cast(attr), ")"); } @@ -189,6 +195,12 @@ void nvte_set_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, std::memcpy(&config_.avg_k, buf, attr_size); config_.avg_k_set = true; break; + case kNVTEGroupedMatmulConfigUseSplitAccumulator: + std::memcpy(&config_.use_split_accumulator, buf, attr_size); + break; + case kNVTEGroupedMatmulConfigSMCount: + std::memcpy(&config_.sm_count, buf, attr_size); + break; default: NVTE_ERROR("Unsupported NVTEGroupedMatmulConfigAttribute (got ", static_cast(attr), ")"); } diff --git a/transformer_engine/common/gemm/config.h b/transformer_engine/common/gemm/config.h index 4f93ff7fbc..012de5e059 100644 --- a/transformer_engine/common/gemm/config.h +++ b/transformer_engine/common/gemm/config.h @@ -7,6 +7,8 @@ #ifndef TRANSFORMER_ENGINE_GEMM_CONFIG_H_ #define TRANSFORMER_ENGINE_GEMM_CONFIG_H_ +#include + #include namespace transformer_engine { @@ -38,6 +40,12 @@ struct GroupedMatmulConfig { int64_t avg_n = 0; int64_t avg_k = 0; + // Whether to use split accumulator for FP8 GEMM (more accurate but slower) + bool use_split_accumulator = true; + + // Number of streaming multiprocessors to use in GEMM kernel + int sm_count = 0; + // Track which attributes have been explicitly set bool avg_m_set = false; bool avg_n_set = false; @@ -46,7 +54,9 @@ struct GroupedMatmulConfig { static constexpr size_t attr_sizes[] = { sizeof(int64_t), // avg_m sizeof(int64_t), // avg_n - sizeof(int64_t) // avg_k + sizeof(int64_t), // avg_k + sizeof(bool), // use_split_accumulator + sizeof(int) // sm_count }; }; diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 03692bf052..0183752b55 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -568,6 +568,15 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT init_matmul_desc(matmulDesc, op_A, op_B); set_fp8_scale_pointers(matmulDesc, A_sel, B_sel); + // Set fast accumulation mode for FP8 + // Fast accumulation: 0 = split accumulator (more accurate), 1 = fast accumulator + const bool is_fp8 = is_fp8_dtype(A_sel.dtype) || is_fp8_dtype(B_sel.dtype); + if (is_fp8) { + int8_t fastAccuMode = config_.use_split_accumulator ? 0 : 1; + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + &matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(fastAccuMode))); + } + // Compute average dimensions for heuristics // K dimension: if transa, K is A's first dim; if not, K is A's last dim int64_t avg_m_val = config_.avg_m_set ? config_.avg_m : compute_avg_first_dim(outputD); diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 00fd0b7048..a596b77fde 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -82,6 +82,10 @@ enum NVTEGroupedMatmulConfigAttribute { * computed automatically from A's logical shape. */ kNVTEGroupedMatmulConfigAvgK = 2, + /*! Whether to use split accumulator for FP8 GEMM. */ + kNVTEGroupedMatmulConfigUseSplitAccumulator = 3, + /*! Number of streaming multiprocessors to use in GEMM kernel. */ + kNVTEGroupedMatmulConfigSMCount = 4, kNVTEGroupedMatmulConfigNumAttributes }; @@ -483,6 +487,18 @@ class GroupedMatmulConfigWrapper { sizeof(int64_t)); } + /*! \brief Set whether to use split accumulator for FP8 GEMM. */ + void set_use_split_accumulator(bool use_split_accumulator) { + nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigUseSplitAccumulator, + &use_split_accumulator, sizeof(bool)); + } + + /*! \brief Set number of streaming multiprocessors to use. */ + void set_sm_count(int sm_count) { + nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigSMCount, + &sm_count, sizeof(int)); + } + private: /*! \brief Wrapped NVTEGroupedMatmulConfig. */ NVTEGroupedMatmulConfig config_ = nullptr; From fb027d0481dd5c4165013b38d9fed53bbc3141fc Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Tue, 30 Dec 2025 12:15:08 +0100 Subject: [PATCH 32/98] fix Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 42 +++++++------------ transformer_engine/common/gemm/config.cpp | 6 --- transformer_engine/common/gemm/config.h | 4 -- .../common/gemm/cublaslt_grouped_gemm.cu | 31 ++++++++------ .../common/include/transformer_engine/gemm.h | 10 +---- 5 files changed, 33 insertions(+), 60 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 1a85e54f82..46add9e5e1 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -279,8 +279,6 @@ Tensor make_fp8_operand(const std::string& name, const std::vector& shap Tensor make_bf16_operand(const std::string& name, const std::vector& shape) { Tensor t(name, shape, DType::kBFloat16); - // Fill with ones for easier debugging - //fillUniform(&t); const size_t numel = shape[0] * shape[1]; std::vector<__nv_bfloat16> ones(numel, __float2bfloat16(1.0f)); NVTE_CHECK_CUDA(cudaMemcpy(t.rowwise_dptr(), ones.data(), @@ -293,8 +291,7 @@ struct TestParams { bool transa; bool transb; ShapeCase shape_case; - bool use_null_c = false; // When true, pass nullptr for C (valid when beta=0) - bool use_split_accumulator = false; // Whether to use split accumulator for FP8 GEMM + bool use_null_c = false; // When true, pass nullptr for C (valid when beta=0) }; // Returns a vector of (M, N, K) tuples for each GEMM in the group. @@ -397,7 +394,7 @@ void run_grouped_gemm_case(const TestParams& params) { false, // grad workspace_ptrs.data(), false, // accumulate - params.use_split_accumulator, + false, // use_split_accumulator 0, // sm_count 0); @@ -450,10 +447,6 @@ void run_grouped_gemm_case(const TestParams& params) { Tensor setup_ws("setup_ws", std::vector{setup_ws_bytes}, DType::kByte); Tensor cublas_ws("cublas_ws", std::vector{cublas_ws_bytes}, DType::kByte); - // Create config with use_split_accumulator setting - transformer_engine::GroupedMatmulConfigWrapper config; - config.set_use_split_accumulator(params.use_split_accumulator); - nvte_grouped_gemm(params.transa, params.transb, alpha_tensor.data(), @@ -464,7 +457,7 @@ void run_grouped_gemm_case(const TestParams& params) { grouped_D.get_handle(), setup_ws.data(), cublas_ws.data(), - config, + nullptr, // config (use defaults) 0); for (size_t i = 0; i < num_gemms; ++i) { @@ -502,29 +495,22 @@ std::string MakeGroupedGemmTestName(const testing::TestParamInfo(info.param.input_case)]) + "_" + - kShapeNames[static_cast(info.param.shape_case)] + "_" + layout + null_c + split_acc; + kShapeNames[static_cast(info.param.shape_case)] + "_" + layout + null_c; } -// TestParams: {input_case, transa, transb, shape_case, use_null_c, use_split_accumulator} +// TestParams: {input_case, transa, transb, shape_case, use_null_c} const std::vector kTestParams = { - // Basic tests (no split accumulator) - {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false, false}, - {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false, false}, - {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false, false}, - {InputCase::kBF16, true, false, ShapeCase::kSameFirst, false, false}, - {InputCase::kBF16, false, true, ShapeCase::kSameLast, false, false}, - {InputCase::kBF16, false, false, ShapeCase::kAllSame, false, false}, - {InputCase::kBF16, true, true, ShapeCase::kAllDifferent, false, false}, + // Basic tests + {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false}, + {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false}, + {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false}, + {InputCase::kBF16, true, false, ShapeCase::kSameFirst, false}, + {InputCase::kBF16, false, true, ShapeCase::kSameLast, false}, + {InputCase::kBF16, false, false, ShapeCase::kAllSame, false}, + {InputCase::kBF16, true, true, ShapeCase::kAllDifferent, false}, // Test NULL C (valid when beta=0) - {InputCase::kBF16, false, false, ShapeCase::kAllSame, true, false}, - - // Split accumulator tests - {InputCase::kFP8Current, true, false, ShapeCase::kAllDifferent, false, true}, - {InputCase::kFP8Current, false, true, ShapeCase::kAllDifferent, false, true}, - {InputCase::kFP8Current, false, false, ShapeCase::kAllSame, false, true}, - {InputCase::kFP8Current, true, false, ShapeCase::kSameFirst, false, true}, + {InputCase::kBF16, false, false, ShapeCase::kAllSame, true}, }; INSTANTIATE_TEST_SUITE_P(OperatorTest, diff --git a/transformer_engine/common/gemm/config.cpp b/transformer_engine/common/gemm/config.cpp index 2c7fc38129..c305ce033d 100644 --- a/transformer_engine/common/gemm/config.cpp +++ b/transformer_engine/common/gemm/config.cpp @@ -154,9 +154,6 @@ void nvte_get_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, case kNVTEGroupedMatmulConfigAvgK: std::memcpy(buf, &config_.avg_k, attr_size); break; - case kNVTEGroupedMatmulConfigUseSplitAccumulator: - std::memcpy(buf, &config_.use_split_accumulator, attr_size); - break; case kNVTEGroupedMatmulConfigSMCount: std::memcpy(buf, &config_.sm_count, attr_size); break; @@ -195,9 +192,6 @@ void nvte_set_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, std::memcpy(&config_.avg_k, buf, attr_size); config_.avg_k_set = true; break; - case kNVTEGroupedMatmulConfigUseSplitAccumulator: - std::memcpy(&config_.use_split_accumulator, buf, attr_size); - break; case kNVTEGroupedMatmulConfigSMCount: std::memcpy(&config_.sm_count, buf, attr_size); break; diff --git a/transformer_engine/common/gemm/config.h b/transformer_engine/common/gemm/config.h index 012de5e059..fd9b1266e6 100644 --- a/transformer_engine/common/gemm/config.h +++ b/transformer_engine/common/gemm/config.h @@ -40,9 +40,6 @@ struct GroupedMatmulConfig { int64_t avg_n = 0; int64_t avg_k = 0; - // Whether to use split accumulator for FP8 GEMM (more accurate but slower) - bool use_split_accumulator = true; - // Number of streaming multiprocessors to use in GEMM kernel int sm_count = 0; @@ -55,7 +52,6 @@ struct GroupedMatmulConfig { sizeof(int64_t), // avg_m sizeof(int64_t), // avg_n sizeof(int64_t), // avg_k - sizeof(bool), // use_split_accumulator sizeof(int) // sm_count }; }; diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 0183752b55..98c78a304d 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -310,15 +310,17 @@ inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, // For column-major layout: leading dimension is the number of rows in storage. // If columnwise data was chosen, storage is already transposed. - int *rowa = A_sel.use_columnwise ? ws.M : (A_sel.trans ? ws.K : ws.M); - int *cola = A_sel.use_columnwise ? ws.K : (A_sel.trans ? ws.M : ws.K); - int *lda = rowa; - int *rowb = B_sel.use_columnwise ? ws.N : (B_sel.trans ? ws.N : ws.K); - int *colb = B_sel.use_columnwise ? ws.K : (B_sel.trans ? ws.K : ws.N); - int *ldb = rowb; - - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rowa, cola, lda)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rowb, colb, ldb)); + // Storage dimensions for A: rows_A x cols_A with leading dimension lda_storage + int *rows_A = A_sel.use_columnwise ? ws.M : (A_sel.trans ? ws.K : ws.M); + int *cols_A = A_sel.use_columnwise ? ws.K : (A_sel.trans ? ws.M : ws.K); + int *lda_storage = rows_A; + // Storage dimensions for B: rows_B x cols_B with leading dimension ldb_storage + int *rows_B = B_sel.use_columnwise ? ws.N : (B_sel.trans ? ws.N : ws.K); + int *cols_B = B_sel.use_columnwise ? ws.K : (B_sel.trans ? ws.K : ws.N); + int *ldb_storage = rows_B; + + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rows_A, cols_A, lda_storage)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rows_B, cols_B, ldb_storage)); NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, ws.M, ws.N, ws.M)); NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, ws.M, ws.N, ws.M)); } @@ -442,14 +444,15 @@ __global__ void setup_grouped_gemm_kernel( D_meta.offsets ? D_meta.offsets[idx] : (idx * D_meta.uniform_first * D_meta.uniform_last); // Compute data pointers + // Note: const_cast is safe here - cuBLAS requires void** but won't modify A/B/C data A_ptrs[idx] = const_cast(a_base) + a_offset * a_elem_size; B_ptrs[idx] = const_cast(b_base) + b_offset * b_elem_size; C_ptrs[idx] = const_cast(c_base) + c_offset * c_elem_size; D_ptrs[idx] = d_base + d_offset * d_elem_size; - // Compute M, N, K dimensions - // Test stores A as {K,M} when !transa, {M,K} when transa - // Test stores B as {N,K} when !transb, {K,N} when transb + // Compute M, N, K dimensions from tensor shapes + // Input A is stored as {K,M} when !transa, {M,K} when transa + // Input B is stored as {N,K} when !transb, {K,N} when transb M[idx] = static_cast(transa ? a_first : a_last); K[idx] = static_cast(transa ? a_last : a_first); N[idx] = static_cast(transb ? b_last : b_first); @@ -570,9 +573,11 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT // Set fast accumulation mode for FP8 // Fast accumulation: 0 = split accumulator (more accurate), 1 = fast accumulator + // Note: cuBLASLt grouped GEMM API does not support configurable split accumulator, + // we always use fast accumulator for performance. const bool is_fp8 = is_fp8_dtype(A_sel.dtype) || is_fp8_dtype(B_sel.dtype); if (is_fp8) { - int8_t fastAccuMode = config_.use_split_accumulator ? 0 : 1; + int8_t fastAccuMode = 1; // Always use fast accumulator NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( &matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(fastAccuMode))); } diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index a596b77fde..1311021185 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -82,10 +82,8 @@ enum NVTEGroupedMatmulConfigAttribute { * computed automatically from A's logical shape. */ kNVTEGroupedMatmulConfigAvgK = 2, - /*! Whether to use split accumulator for FP8 GEMM. */ - kNVTEGroupedMatmulConfigUseSplitAccumulator = 3, /*! Number of streaming multiprocessors to use in GEMM kernel. */ - kNVTEGroupedMatmulConfigSMCount = 4, + kNVTEGroupedMatmulConfigSMCount = 3, kNVTEGroupedMatmulConfigNumAttributes }; @@ -487,12 +485,6 @@ class GroupedMatmulConfigWrapper { sizeof(int64_t)); } - /*! \brief Set whether to use split accumulator for FP8 GEMM. */ - void set_use_split_accumulator(bool use_split_accumulator) { - nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigUseSplitAccumulator, - &use_split_accumulator, sizeof(bool)); - } - /*! \brief Set number of streaming multiprocessors to use. */ void set_sm_count(int sm_count) { nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigSMCount, From ae854151137f41571fbb8c921d627ed96dd0b301 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Dec 2025 11:16:29 +0000 Subject: [PATCH 33/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/gemm/config.h | 4 ++-- .../common/gemm/cublaslt_grouped_gemm.cu | 10 ++++++---- .../common/include/transformer_engine/gemm.h | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/transformer_engine/common/gemm/config.h b/transformer_engine/common/gemm/config.h index fd9b1266e6..6f75a34b37 100644 --- a/transformer_engine/common/gemm/config.h +++ b/transformer_engine/common/gemm/config.h @@ -7,10 +7,10 @@ #ifndef TRANSFORMER_ENGINE_GEMM_CONFIG_H_ #define TRANSFORMER_ENGINE_GEMM_CONFIG_H_ -#include - #include +#include + namespace transformer_engine { struct MatmulConfig { diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 98c78a304d..20c3e5222a 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -319,8 +319,10 @@ inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, int *cols_B = B_sel.use_columnwise ? ws.K : (B_sel.trans ? ws.K : ws.N); int *ldb_storage = rows_B; - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rows_A, cols_A, lda_storage)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rows_B, cols_B, ldb_storage)); + NVTE_CHECK_CUBLAS( + cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rows_A, cols_A, lda_storage)); + NVTE_CHECK_CUBLAS( + cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rows_B, cols_B, ldb_storage)); NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, ws.M, ws.N, ws.M)); NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, ws.M, ws.N, ws.M)); } @@ -578,8 +580,8 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT const bool is_fp8 = is_fp8_dtype(A_sel.dtype) || is_fp8_dtype(B_sel.dtype); if (is_fp8) { int8_t fastAccuMode = 1; // Always use fast accumulator - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - &matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(fastAccuMode))); + NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, + &fastAccuMode, sizeof(fastAccuMode))); } // Compute average dimensions for heuristics diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 1311021185..1971714621 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -487,8 +487,8 @@ class GroupedMatmulConfigWrapper { /*! \brief Set number of streaming multiprocessors to use. */ void set_sm_count(int sm_count) { - nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigSMCount, - &sm_count, sizeof(int)); + nvte_set_grouped_matmul_config_attribute(config_, kNVTEGroupedMatmulConfigSMCount, &sm_count, + sizeof(int)); } private: From f1fc31c5d043f9b4224d0a6e95e0e55335788383 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Mon, 5 Jan 2026 08:59:34 -0800 Subject: [PATCH 34/98] wip --- .../jax/csrc/extensions/gemm.cpp | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 6566ff1689..79418c138e 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -768,10 +768,24 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type NVTE_CHECK_CUDA(cudaMemsetAsync(dptr, 0, count, stream_i)); } - nvte_multi_tensor_gemm(rhs_list.data(), lhs_list.data(), out_list.data(), bias_list.data(), - pre_gelu_list.data(), num_non_empty_gemms, rhs_is_trans, lhs_is_trans, - grad, workspace_list.data(), accumulate, use_split_accumulator, - num_math_sm, stream); + // nvte_multi_tensor_gemm(rhs_list.data(), lhs_list.data(), out_list.data(), bias_list.data(), + // pre_gelu_list.data(), num_non_empty_gemms, rhs_is_trans, lhs_is_trans, + // grad, workspace_list.data(), accumulate, use_split_accumulator, + // num_math_sm, stream); + int64_t avg_m = 0, avg_n = 0, avg_k = 0; + nvte_grouped_gemm( + rhs_is_trans, lhs_is_trans, + alpha, + rhs_list, lhs_list, + beta, + C, + out_list, + workspace_setup, + workspace_cublas, + stream, + &avg_m, + &avg_n, + &avg_k); return ffi_with_cuda_error_check(); } From 43f7e60ecf449413e8fcfe77f4b09bb708c16f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Gadzi=C5=84ski?= <62263673+pggPL@users.noreply.github.com> Date: Wed, 7 Jan 2026 16:22:41 +0100 Subject: [PATCH 35/98] Update transformer_engine/common/gemm/config.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Przemyslaw Tredak Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com> --- transformer_engine/common/gemm/config.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transformer_engine/common/gemm/config.h b/transformer_engine/common/gemm/config.h index 6f75a34b37..56c5db16c9 100644 --- a/transformer_engine/common/gemm/config.h +++ b/transformer_engine/common/gemm/config.h @@ -49,10 +49,10 @@ struct GroupedMatmulConfig { bool avg_k_set = false; static constexpr size_t attr_sizes[] = { - sizeof(int64_t), // avg_m - sizeof(int64_t), // avg_n - sizeof(int64_t), // avg_k - sizeof(int) // sm_count + sizeof(avg_m), + sizeof(avg_n), + sizeof(avg_k), + sizeof(sm_count) }; }; From 30468af1570212b254718e2cc25ea8ed64d0b9b1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 7 Jan 2026 15:23:47 +0000 Subject: [PATCH 36/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/gemm/config.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/transformer_engine/common/gemm/config.h b/transformer_engine/common/gemm/config.h index 56c5db16c9..2723bf2d30 100644 --- a/transformer_engine/common/gemm/config.h +++ b/transformer_engine/common/gemm/config.h @@ -48,12 +48,8 @@ struct GroupedMatmulConfig { bool avg_n_set = false; bool avg_k_set = false; - static constexpr size_t attr_sizes[] = { - sizeof(avg_m), - sizeof(avg_n), - sizeof(avg_k), - sizeof(sm_count) - }; + static constexpr size_t attr_sizes[] = {sizeof(avg_m), sizeof(avg_n), sizeof(avg_k), + sizeof(sm_count)}; }; } // namespace transformer_engine From 2ccaee5699af2df5cc1f60174578db8a071a3d41 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Wed, 7 Jan 2026 16:33:28 +0100 Subject: [PATCH 37/98] changed Signed-off-by: Pawel Gadzinski --- transformer_engine/common/gemm/config.cpp | 48 ++++++++++++------- transformer_engine/common/gemm/config.h | 14 ++---- .../common/gemm/cublaslt_grouped_gemm.cu | 21 ++------ .../common/gemm/cublaslt_grouped_gemm.cuh | 17 ------- 4 files changed, 39 insertions(+), 61 deletions(-) delete mode 100644 transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh diff --git a/transformer_engine/common/gemm/config.cpp b/transformer_engine/common/gemm/config.cpp index c305ce033d..9cdfb29bbd 100644 --- a/transformer_engine/common/gemm/config.cpp +++ b/transformer_engine/common/gemm/config.cpp @@ -145,15 +145,21 @@ void nvte_get_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, NVTE_CHECK(config != nullptr, "Invalid NVTEGroupedMatmulConfig (got NULL)"); const auto &config_ = *reinterpret_cast(config); switch (attr) { - case kNVTEGroupedMatmulConfigAvgM: - std::memcpy(buf, &config_.avg_m, attr_size); + case kNVTEGroupedMatmulConfigAvgM: { + int64_t val = config_.avg_m.value_or(0); + std::memcpy(buf, &val, attr_size); break; - case kNVTEGroupedMatmulConfigAvgN: - std::memcpy(buf, &config_.avg_n, attr_size); + } + case kNVTEGroupedMatmulConfigAvgN: { + int64_t val = config_.avg_n.value_or(0); + std::memcpy(buf, &val, attr_size); break; - case kNVTEGroupedMatmulConfigAvgK: - std::memcpy(buf, &config_.avg_k, attr_size); + } + case kNVTEGroupedMatmulConfigAvgK: { + int64_t val = config_.avg_k.value_or(0); + std::memcpy(buf, &val, attr_size); break; + } case kNVTEGroupedMatmulConfigSMCount: std::memcpy(buf, &config_.sm_count, attr_size); break; @@ -180,18 +186,24 @@ void nvte_set_grouped_matmul_config_attribute(NVTEGroupedMatmulConfig config, NVTE_CHECK(config != nullptr, "Invalid NVTEGroupedMatmulConfig (got NULL)"); auto &config_ = *reinterpret_cast(config); switch (attr) { - case kNVTEGroupedMatmulConfigAvgM: - std::memcpy(&config_.avg_m, buf, attr_size); - config_.avg_m_set = true; - break; - case kNVTEGroupedMatmulConfigAvgN: - std::memcpy(&config_.avg_n, buf, attr_size); - config_.avg_n_set = true; - break; - case kNVTEGroupedMatmulConfigAvgK: - std::memcpy(&config_.avg_k, buf, attr_size); - config_.avg_k_set = true; - break; + case kNVTEGroupedMatmulConfigAvgM: { + int64_t val; + std::memcpy(&val, buf, attr_size); + config_.avg_m = val; + break; + } + case kNVTEGroupedMatmulConfigAvgN: { + int64_t val; + std::memcpy(&val, buf, attr_size); + config_.avg_n = val; + break; + } + case kNVTEGroupedMatmulConfigAvgK: { + int64_t val; + std::memcpy(&val, buf, attr_size); + config_.avg_k = val; + break; + } case kNVTEGroupedMatmulConfigSMCount: std::memcpy(&config_.sm_count, buf, attr_size); break; diff --git a/transformer_engine/common/gemm/config.h b/transformer_engine/common/gemm/config.h index 2723bf2d30..b1aaae2591 100644 --- a/transformer_engine/common/gemm/config.h +++ b/transformer_engine/common/gemm/config.h @@ -10,6 +10,7 @@ #include #include +#include namespace transformer_engine { @@ -35,19 +36,14 @@ struct MatmulConfig { struct GroupedMatmulConfig { // Average dimension hints for cuBLASLt algorithm selection heuristics. - // Value of 0 means "not set" - compute automatically from tensor shapes. - int64_t avg_m = 0; - int64_t avg_n = 0; - int64_t avg_k = 0; + // nullopt means "not set" - compute automatically from tensor shapes. + std::optional avg_m; + std::optional avg_n; + std::optional avg_k; // Number of streaming multiprocessors to use in GEMM kernel int sm_count = 0; - // Track which attributes have been explicitly set - bool avg_m_set = false; - bool avg_n_set = false; - bool avg_k_set = false; - static constexpr size_t attr_sizes[] = {sizeof(avg_m), sizeof(avg_n), sizeof(avg_k), sizeof(sm_count)}; }; diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 20c3e5222a..d11e2221be 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -17,7 +17,6 @@ #include "../util/handle_manager.h" #include "../util/logging.h" #include "./config.h" -#include "./cublaslt_grouped_gemm.cuh" namespace { @@ -573,24 +572,12 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT init_matmul_desc(matmulDesc, op_A, op_B); set_fp8_scale_pointers(matmulDesc, A_sel, B_sel); - // Set fast accumulation mode for FP8 - // Fast accumulation: 0 = split accumulator (more accurate), 1 = fast accumulator - // Note: cuBLASLt grouped GEMM API does not support configurable split accumulator, - // we always use fast accumulator for performance. - const bool is_fp8 = is_fp8_dtype(A_sel.dtype) || is_fp8_dtype(B_sel.dtype); - if (is_fp8) { - int8_t fastAccuMode = 1; // Always use fast accumulator - NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(&matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, - &fastAccuMode, sizeof(fastAccuMode))); - } - // Compute average dimensions for heuristics // K dimension: if transa, K is A's first dim; if not, K is A's last dim - int64_t avg_m_val = config_.avg_m_set ? config_.avg_m : compute_avg_first_dim(outputD); - int64_t avg_n_val = config_.avg_n_set ? config_.avg_n : compute_avg_last_dim(outputD); - int64_t avg_k_val = config_.avg_k_set ? config_.avg_k - : (A_sel.trans ? compute_avg_first_dim(A_sel.tensor) - : compute_avg_last_dim(A_sel.tensor)); + int64_t avg_m_val = config_.avg_m.value_or(compute_avg_first_dim(outputD)); + int64_t avg_n_val = config_.avg_n.value_or(compute_avg_last_dim(outputD)); + int64_t avg_k_val = config_.avg_k.value_or(A_sel.trans ? compute_avg_first_dim(A_sel.tensor) + : compute_avg_last_dim(A_sel.tensor)); // Heuristic selection cublasLtMatmulAlgo_t algo = select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh deleted file mode 100644 index a032e594d5..0000000000 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cuh +++ /dev/null @@ -1,17 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * See LICENSE for license information. - ************************************************************************/ - -#ifndef TRANSFORMER_ENGINE_COMMON_GEMM_CUBLASLT_GROUPED_GEMM_CUH_ -#define TRANSFORMER_ENGINE_COMMON_GEMM_CUBLASLT_GROUPED_GEMM_CUH_ - -#include -#include -#include - -// nvte_grouped_gemm is declared in transformer_engine/gemm.h -// This header is for internal use only. - -#endif // TRANSFORMER_ENGINE_COMMON_GEMM_CUBLASLT_GROUPED_GEMM_CUH_ From bd8fa3010fdad67fc6556063a3058852fe7e572e Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Wed, 7 Jan 2026 17:18:25 +0100 Subject: [PATCH 38/98] suggestions Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 223 +----------------- tests/cpp/test_common.cu | 163 +++++++++++++ tests/cpp/test_common.h | 54 +++++ .../common/gemm/cublaslt_grouped_gemm.cu | 30 +-- .../common/include/transformer_engine/gemm.h | 17 +- 5 files changed, 247 insertions(+), 240 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 46add9e5e1..8ff7fa75aa 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -41,57 +41,6 @@ enum class ShapeCase { kAllDifferent, }; -// Custom deleters for RAII -struct CudaDeleter { - void operator()(void* p) const { if (p) cudaFree(p); } -}; -struct GroupedTensorDeleter { - void operator()(NVTEGroupedTensor h) const { if (h) nvte_destroy_grouped_tensor(h); } -}; - -template -using CudaPtr = std::unique_ptr; -using GroupedTensorHandle = std::unique_ptr, GroupedTensorDeleter>; - -// Helper to allocate CUDA memory into a CudaPtr -template -CudaPtr cuda_alloc(size_t bytes) { - void* ptr = nullptr; - NVTE_CHECK_CUDA(cudaMalloc(&ptr, bytes)); - return CudaPtr(static_cast(ptr)); -} - -// Helper owning GPU buffers that back NVTEGroupedTensor. -// NVTEGroupedTensor does not own memory; data/offsets/scales -// must be allocated and freed by the test. -struct GroupedBuffers { - GroupedTensorHandle handle; - CudaPtr<> data; - CudaPtr<> scale_inv; - CudaPtr first_dims_dev; - CudaPtr last_dims_dev; - CudaPtr offsets_dev; - CudaPtr<> columnwise_data; - NVTEShape logical_shape{}; - std::vector offsets_host; - std::vector tensor_bytes; - size_t num_tensors{0}; - size_t elem_size{0}; - DType dtype{DType::kFloat32}; - NVTEScalingMode scaling_mode{NVTE_DELAYED_TENSOR_SCALING}; - - GroupedBuffers() = default; - GroupedBuffers(const GroupedBuffers&) = delete; - GroupedBuffers& operator=(const GroupedBuffers&) = delete; - GroupedBuffers(GroupedBuffers&&) = default; - GroupedBuffers& operator=(GroupedBuffers&&) = default; - ~GroupedBuffers() = default; - - // Convenience accessors for raw pointers - NVTEGroupedTensor get_handle() const { return handle.get(); } - void* get_data() const { return data.get(); } -}; - size_t grouped_setup_workspace_size(const size_t num_tensors) { const size_t ptr_bytes = num_tensors * sizeof(void*); const size_t int_bytes = num_tensors * sizeof(int); @@ -102,168 +51,6 @@ size_t grouped_setup_workspace_size(const size_t num_tensors) { return size; } -GroupedBuffers build_grouped_tensor(const std::vector& tensors, - const NVTEScalingMode scaling_mode) { - NVTE_CHECK(!tensors.empty(), "No tensors provided for grouped tensor build."); - const NVTEShape shape = tensors[0]->rowwise_shape(); - const DType dtype = tensors[0]->dtype(); - const size_t num_tensors = tensors.size(); - const size_t elem_size = typeToNumBits(dtype) / 8; - GroupedBuffers grouped; - grouped.elem_size = elem_size; - grouped.num_tensors = num_tensors; - grouped.dtype = dtype; - grouped.scaling_mode = scaling_mode; - grouped.tensor_bytes.resize(num_tensors); - grouped.offsets_host.resize(num_tensors, 0); - - std::vector first_dims(num_tensors); - std::vector last_dims(num_tensors); - for (size_t i = 0; i < num_tensors; ++i) { - const auto s = tensors[i]->rowwise_shape(); - NVTE_CHECK(s.ndim == 2, "Grouped GEMM test expects 2D tensors."); - first_dims[i] = static_cast(s.data[0]); - last_dims[i] = static_cast(s.data[1]); - grouped.tensor_bytes[i] = bytes(s, dtype); - } - - const bool same_first = std::all_of(first_dims.begin(), first_dims.end(), - [&](int64_t v) { return v == first_dims[0]; }); - const bool same_last = std::all_of(last_dims.begin(), last_dims.end(), - [&](int64_t v) { return v == last_dims[0]; }); - - std::vector offsets(num_tensors, 0); - auto random_padding = [&]() -> int64_t { - // Random padding ensuring 16-byte alignment regardless of element size - // cuBLAS requires aligned pointers for vectorized loads - static std::mt19937 gen(12345); - std::uniform_int_distribution dist(0, 3); - // Calculate elements needed for 16-byte alignment in bytes, rounded up - const size_t align_elements = - std::max(1, (16 + elem_size - 1) / elem_size); // 16 bytes / element_size - return dist(gen) * static_cast(align_elements); - }; - - auto numel = [&](size_t idx) -> int64_t { - return first_dims[idx] * last_dims[idx]; - }; - - const bool need_offsets = !same_first || !same_last; - if (need_offsets) { - offsets[0] = 0; - for (size_t i = 1; i < num_tensors; ++i) { - offsets[i] = offsets[i - 1] + numel(i - 1) + random_padding(); - } - } else { - for (size_t i = 0; i < num_tensors; ++i) { - offsets[i] = static_cast(i) * numel(0); - } - } - grouped.offsets_host = offsets; - - int64_t logical_first = 0; - int64_t logical_last = 0; - if (same_first && same_last) { - logical_first = first_dims[0] * static_cast(num_tensors); - logical_last = last_dims[0]; - } else if (same_first && !same_last) { - logical_first = first_dims[0]; - logical_last = std::accumulate(last_dims.begin(), last_dims.end(), int64_t{0}); - } else if (!same_first && same_last) { - logical_first = std::accumulate(first_dims.begin(), first_dims.end(), int64_t{0}); - logical_last = last_dims[0]; - } else { - logical_first = 1; - logical_last = 0; - for (size_t i = 0; i < num_tensors; ++i) { - logical_last += first_dims[i] * last_dims[i]; - } - } - size_t logical_data[2] = {static_cast(logical_first), - static_cast(logical_last)}; - grouped.logical_shape = nvte_make_shape(logical_data, 2); - grouped.handle.reset(nvte_create_grouped_tensor(scaling_mode, num_tensors, grouped.logical_shape)); - - const int64_t last_idx = static_cast(num_tensors - 1); - const int64_t total_elems = need_offsets - ? (offsets[last_idx] + numel(last_idx)) - : (logical_first * logical_last); - const size_t total_bytes = static_cast(total_elems) * elem_size; - - grouped.data = cuda_alloc(total_bytes); - for (size_t i = 0; i < num_tensors; ++i) { - const size_t offset_bytes = static_cast(offsets[i]) * elem_size; - NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.data.get()) + offset_bytes, - tensors[i]->rowwise_dptr(), - grouped.tensor_bytes[i], - cudaMemcpyDeviceToDevice)); - } - - NVTEBasicTensor data_tensor{grouped.data.get(), static_cast(dtype), grouped.logical_shape}; - NVTEGroupedTensor h = grouped.handle.get(); - nvte_set_grouped_tensor_param(&h, kNVTEGroupedRowwiseData, &data_tensor); - - const bool include_columnwise = isFp8Type(dtype) || isFp4Type(dtype); - if (include_columnwise) { - grouped.columnwise_data = cuda_alloc(total_bytes); - for (size_t i = 0; i < num_tensors; ++i) { - const size_t offset_bytes = static_cast(offsets[i]) * elem_size; - NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.columnwise_data.get()) + offset_bytes, - tensors[i]->columnwise_dptr(), - grouped.tensor_bytes[i], - cudaMemcpyDeviceToDevice)); - } - NVTEBasicTensor col_tensor{grouped.columnwise_data.get(), - static_cast(dtype), - grouped.logical_shape}; - nvte_set_grouped_tensor_param(&h, kNVTEGroupedColumnwiseData, &col_tensor); - } - - if (!same_first) { - grouped.first_dims_dev = cuda_alloc(num_tensors * sizeof(int64_t)); - NVTE_CHECK_CUDA(cudaMemcpy(grouped.first_dims_dev.get(), first_dims.data(), - num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); - NVTEShape fd_shape = nvte_make_shape(&num_tensors, 1); - NVTEBasicTensor fd_tensor{grouped.first_dims_dev.get(), kNVTEInt64, fd_shape}; - nvte_set_grouped_tensor_param(&h, kNVTEGroupedFirstDims, &fd_tensor); - } - - if (!same_last) { - grouped.last_dims_dev = cuda_alloc(num_tensors * sizeof(int64_t)); - NVTE_CHECK_CUDA(cudaMemcpy(grouped.last_dims_dev.get(), last_dims.data(), - num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); - NVTEShape ld_shape = nvte_make_shape(&num_tensors, 1); - NVTEBasicTensor ld_tensor{grouped.last_dims_dev.get(), kNVTEInt64, ld_shape}; - nvte_set_grouped_tensor_param(&h, kNVTEGroupedLastDims, &ld_tensor); - } - - if (!same_first || !same_last) { - grouped.offsets_dev = cuda_alloc(num_tensors * sizeof(int64_t)); - NVTE_CHECK_CUDA(cudaMemcpy(grouped.offsets_dev.get(), offsets.data(), - num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); - NVTEShape off_shape = nvte_make_shape(&num_tensors, 1); - NVTEBasicTensor off_tensor{grouped.offsets_dev.get(), kNVTEInt64, off_shape}; - nvte_set_grouped_tensor_param(&h, kNVTEGroupedTensorOffsets, &off_tensor); - } - - if (isFp8Type(dtype)) { - std::vector scale_inv_cpu(num_tensors, 1.f); - for (size_t i = 0; i < num_tensors; ++i) { - tensors[i]->to_cpu(); - scale_inv_cpu[i] = tensors[i]->rowwise_cpu_scale_inv_ptr()[0]; - } - grouped.scale_inv = cuda_alloc(sizeof(float) * num_tensors); - NVTE_CHECK_CUDA(cudaMemcpy(grouped.scale_inv.get(), scale_inv_cpu.data(), - sizeof(float) * num_tensors, cudaMemcpyHostToDevice)); - NVTEShape scale_shape = nvte_make_shape(&num_tensors, 1); - NVTEBasicTensor scale_tensor{grouped.scale_inv.get(), kNVTEFloat32, scale_shape}; - nvte_set_grouped_tensor_param(&h, kNVTEGroupedRowwiseScaleInv, &scale_tensor); - nvte_set_grouped_tensor_param(&h, kNVTEGroupedColumnwiseScaleInv, &scale_tensor); - } - - return grouped; -} - Tensor make_fp8_operand(const std::string& name, const std::vector& shape) { Tensor input_fp32(name + "_fp32", shape, DType::kFloat32); fillUniform(&input_fp32); @@ -447,14 +234,14 @@ void run_grouped_gemm_case(const TestParams& params) { Tensor setup_ws("setup_ws", std::vector{setup_ws_bytes}, DType::kByte); Tensor cublas_ws("cublas_ws", std::vector{cublas_ws_bytes}, DType::kByte); - nvte_grouped_gemm(params.transa, - params.transb, - alpha_tensor.data(), - grouped_A.get_handle(), + nvte_grouped_gemm(grouped_A.get_handle(), + params.transa, grouped_B.get_handle(), - beta_tensor.data(), + params.transb, params.use_null_c ? nullptr : grouped_C->get_handle(), grouped_D.get_handle(), + alpha_tensor.data(), + beta_tensor.data(), setup_ws.data(), cublas_ws.data(), nullptr, // config (use defaults) diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index d70eb13536..21586fc499 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -1057,4 +1058,166 @@ std::array get_scale_tensor_dims(const size_t rows, return {unpadded_blocks_Y, unpadded_blocks_X, blocks_Y, blocks_X}; } +GroupedBuffers build_grouped_tensor(const std::vector& tensors, + const NVTEScalingMode scaling_mode) { + NVTE_CHECK(!tensors.empty(), "No tensors provided for grouped tensor build."); + const NVTEShape shape = tensors[0]->rowwise_shape(); + const DType dtype = tensors[0]->dtype(); + const size_t num_tensors = tensors.size(); + const size_t elem_size = typeToNumBits(dtype) / 8; + GroupedBuffers grouped; + grouped.elem_size = elem_size; + grouped.num_tensors = num_tensors; + grouped.dtype = dtype; + grouped.scaling_mode = scaling_mode; + grouped.tensor_bytes.resize(num_tensors); + grouped.offsets_host.resize(num_tensors, 0); + + std::vector first_dims(num_tensors); + std::vector last_dims(num_tensors); + for (size_t i = 0; i < num_tensors; ++i) { + const auto s = tensors[i]->rowwise_shape(); + NVTE_CHECK(s.ndim == 2, "Grouped tensor build expects 2D tensors."); + first_dims[i] = static_cast(s.data[0]); + last_dims[i] = static_cast(s.data[1]); + grouped.tensor_bytes[i] = bytes(s, dtype); + } + + const bool same_first = std::all_of(first_dims.begin(), first_dims.end(), + [&](int64_t v) { return v == first_dims[0]; }); + const bool same_last = std::all_of(last_dims.begin(), last_dims.end(), + [&](int64_t v) { return v == last_dims[0]; }); + + std::vector offsets(num_tensors, 0); + auto random_padding = [&]() -> int64_t { + // Random padding ensuring 16-byte alignment regardless of element size + // cuBLAS requires aligned pointers for vectorized loads + static std::mt19937 gen(12345); + std::uniform_int_distribution dist(0, 3); + // Calculate elements needed for 16-byte alignment in bytes, rounded up + const size_t align_elements = + std::max(1, (16 + elem_size - 1) / elem_size); // 16 bytes / element_size + return dist(gen) * static_cast(align_elements); + }; + + auto numel = [&](size_t idx) -> int64_t { + return first_dims[idx] * last_dims[idx]; + }; + + const bool need_offsets = !same_first || !same_last; + if (need_offsets) { + offsets[0] = 0; + for (size_t i = 1; i < num_tensors; ++i) { + offsets[i] = offsets[i - 1] + numel(i - 1) + random_padding(); + } + } else { + for (size_t i = 0; i < num_tensors; ++i) { + offsets[i] = static_cast(i) * numel(0); + } + } + grouped.offsets_host = offsets; + + int64_t logical_first = 0; + int64_t logical_last = 0; + if (same_first && same_last) { + logical_first = first_dims[0] * static_cast(num_tensors); + logical_last = last_dims[0]; + } else if (same_first && !same_last) { + logical_first = first_dims[0]; + logical_last = std::accumulate(last_dims.begin(), last_dims.end(), int64_t{0}); + } else if (!same_first && same_last) { + logical_first = std::accumulate(first_dims.begin(), first_dims.end(), int64_t{0}); + logical_last = last_dims[0]; + } else { + logical_first = 1; + logical_last = 0; + for (size_t i = 0; i < num_tensors; ++i) { + logical_last += first_dims[i] * last_dims[i]; + } + } + size_t logical_data[2] = {static_cast(logical_first), + static_cast(logical_last)}; + grouped.logical_shape = nvte_make_shape(logical_data, 2); + grouped.handle.reset(nvte_create_grouped_tensor(scaling_mode, num_tensors, grouped.logical_shape)); + + const int64_t last_idx = static_cast(num_tensors - 1); + const int64_t total_elems = need_offsets + ? (offsets[last_idx] + numel(last_idx)) + : (logical_first * logical_last); + const size_t total_bytes = static_cast(total_elems) * elem_size; + + grouped.data = cuda_alloc(total_bytes); + for (size_t i = 0; i < num_tensors; ++i) { + const size_t offset_bytes = static_cast(offsets[i]) * elem_size; + NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.data.get()) + offset_bytes, + tensors[i]->rowwise_dptr(), + grouped.tensor_bytes[i], + cudaMemcpyDeviceToDevice)); + } + + NVTEBasicTensor data_tensor{grouped.data.get(), static_cast(dtype), grouped.logical_shape}; + NVTEGroupedTensor h = grouped.handle.get(); + nvte_set_grouped_tensor_param(&h, kNVTEGroupedRowwiseData, &data_tensor); + + const bool include_columnwise = isFp8Type(dtype) || isFp4Type(dtype); + if (include_columnwise) { + grouped.columnwise_data = cuda_alloc(total_bytes); + for (size_t i = 0; i < num_tensors; ++i) { + const size_t offset_bytes = static_cast(offsets[i]) * elem_size; + NVTE_CHECK_CUDA(cudaMemcpy(static_cast(grouped.columnwise_data.get()) + offset_bytes, + tensors[i]->columnwise_dptr(), + grouped.tensor_bytes[i], + cudaMemcpyDeviceToDevice)); + } + NVTEBasicTensor col_tensor{grouped.columnwise_data.get(), + static_cast(dtype), + grouped.logical_shape}; + nvte_set_grouped_tensor_param(&h, kNVTEGroupedColumnwiseData, &col_tensor); + } + + if (!same_first) { + grouped.first_dims_dev = cuda_alloc(num_tensors * sizeof(int64_t)); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.first_dims_dev.get(), first_dims.data(), + num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); + NVTEShape fd_shape = nvte_make_shape(&num_tensors, 1); + NVTEBasicTensor fd_tensor{grouped.first_dims_dev.get(), kNVTEInt64, fd_shape}; + nvte_set_grouped_tensor_param(&h, kNVTEGroupedFirstDims, &fd_tensor); + } + + if (!same_last) { + grouped.last_dims_dev = cuda_alloc(num_tensors * sizeof(int64_t)); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.last_dims_dev.get(), last_dims.data(), + num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); + NVTEShape ld_shape = nvte_make_shape(&num_tensors, 1); + NVTEBasicTensor ld_tensor{grouped.last_dims_dev.get(), kNVTEInt64, ld_shape}; + nvte_set_grouped_tensor_param(&h, kNVTEGroupedLastDims, &ld_tensor); + } + + if (!same_first || !same_last) { + grouped.offsets_dev = cuda_alloc(num_tensors * sizeof(int64_t)); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.offsets_dev.get(), offsets.data(), + num_tensors * sizeof(int64_t), cudaMemcpyHostToDevice)); + NVTEShape off_shape = nvte_make_shape(&num_tensors, 1); + NVTEBasicTensor off_tensor{grouped.offsets_dev.get(), kNVTEInt64, off_shape}; + nvte_set_grouped_tensor_param(&h, kNVTEGroupedTensorOffsets, &off_tensor); + } + + if (isFp8Type(dtype)) { + std::vector scale_inv_cpu(num_tensors, 1.f); + for (size_t i = 0; i < num_tensors; ++i) { + tensors[i]->to_cpu(); + scale_inv_cpu[i] = tensors[i]->rowwise_cpu_scale_inv_ptr()[0]; + } + grouped.scale_inv = cuda_alloc(sizeof(float) * num_tensors); + NVTE_CHECK_CUDA(cudaMemcpy(grouped.scale_inv.get(), scale_inv_cpu.data(), + sizeof(float) * num_tensors, cudaMemcpyHostToDevice)); + NVTEShape scale_shape = nvte_make_shape(&num_tensors, 1); + NVTEBasicTensor scale_tensor{grouped.scale_inv.get(), kNVTEFloat32, scale_shape}; + nvte_set_grouped_tensor_param(&h, kNVTEGroupedRowwiseScaleInv, &scale_tensor); + nvte_set_grouped_tensor_param(&h, kNVTEGroupedColumnwiseScaleInv, &scale_tensor); + } + + return grouped; +} + } // namespace test diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index b8993dfb62..106c336405 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -500,6 +500,60 @@ int32_t getDeviceComputeCapability(); constexpr int32_t hopperComputeCapability = 90; constexpr int32_t blackwellComputeCapability = 100; +// Custom deleters for RAII +struct CudaDeleter { + void operator()(void* p) const { if (p) cudaFree(p); } +}; +struct GroupedTensorDeleter { + void operator()(NVTEGroupedTensor h) const { if (h) nvte_destroy_grouped_tensor(h); } +}; + +template +using CudaPtr = std::unique_ptr; +using GroupedTensorHandle = std::unique_ptr, GroupedTensorDeleter>; + +// Helper to allocate CUDA memory into a CudaPtr +template +CudaPtr cuda_alloc(size_t bytes) { + void* ptr = nullptr; + NVTE_CHECK_CUDA(cudaMalloc(&ptr, bytes)); + return CudaPtr(static_cast(ptr)); +} + +// Helper owning GPU buffers that back NVTEGroupedTensor. +// NVTEGroupedTensor does not own memory; data/offsets/scales +// must be allocated and freed by the test. +struct GroupedBuffers { + GroupedTensorHandle handle; + CudaPtr<> data; + CudaPtr<> scale_inv; + CudaPtr first_dims_dev; + CudaPtr last_dims_dev; + CudaPtr offsets_dev; + CudaPtr<> columnwise_data; + NVTEShape logical_shape{}; + std::vector offsets_host; + std::vector tensor_bytes; + size_t num_tensors{0}; + size_t elem_size{0}; + DType dtype{DType::kFloat32}; + NVTEScalingMode scaling_mode{NVTE_DELAYED_TENSOR_SCALING}; + + GroupedBuffers() = default; + GroupedBuffers(const GroupedBuffers&) = delete; + GroupedBuffers& operator=(const GroupedBuffers&) = delete; + GroupedBuffers(GroupedBuffers&&) = default; + GroupedBuffers& operator=(GroupedBuffers&&) = default; + ~GroupedBuffers() = default; + + // Convenience accessors for raw pointers + NVTEGroupedTensor get_handle() const { return handle.get(); } + void* get_data() const { return data.get(); } +}; + +GroupedBuffers build_grouped_tensor(const std::vector& tensors, + const NVTEScalingMode scaling_mode); + } // namespace test #if FP4_TYPE_SUPPORTED diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index d11e2221be..5638dc772f 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -62,7 +62,7 @@ struct TensorShapeInfo { } // Create for C tensor (uses D's dimensions, only has offsets) - static TensorShapeInfo for_C(const transformer_engine::GroupedTensor *C, + static TensorShapeInfo create_shape_info_for_C(const transformer_engine::GroupedTensor *C, const transformer_engine::GroupedTensor *D) { const bool has_first = D->first_dims.has_data(); const bool has_last = D->last_dims.has_data(); @@ -166,16 +166,16 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor const transformer_engine::Tensor *alpha_tensor, const transformer_engine::Tensor *beta_tensor) { const size_t num_tensors = inputA->num_tensors; - NVTE_CHECK(num_tensors >= 1, "Grouped GEMM: num_tensors must be at least 1"); + NVTE_CHECK(num_tensors >= 1, "Grouped GEMM: number of tensors must be at least 1"); NVTE_CHECK(inputB->num_tensors == num_tensors, - "Grouped GEMM: A and B must have the same num_tensors"); + "Grouped GEMM: A and B must have the same number of tensors"); // C can be NULL (will use D as C when beta=0) if (inputC != nullptr) { NVTE_CHECK(inputC->num_tensors == num_tensors, - "Grouped GEMM: A and C must have the same num_tensors"); + "Grouped GEMM: A and C must have the same number of tensors"); } NVTE_CHECK(outputD->num_tensors == num_tensors, - "Grouped GEMM: A and D must have the same num_tensors"); + "Grouped GEMM: A and D must have the same number of tensors"); // Validate alpha/beta have per-matrix values const size_t alpha_numel = alpha_tensor->data.numel(); @@ -471,7 +471,7 @@ inline void launch_grouped_gemm_setup( const transformer_engine::Tensor *beta_tensor, size_t num_tensors, cudaStream_t stream) { TensorShapeInfo A_meta = TensorShapeInfo::from_tensor(A_sel.tensor); TensorShapeInfo B_meta = TensorShapeInfo::from_tensor(B_sel.tensor); - TensorShapeInfo C_meta = TensorShapeInfo::for_C(C, D); + TensorShapeInfo C_meta = TensorShapeInfo::create_shape_info_for_C(C, D); TensorShapeInfo D_meta = TensorShapeInfo::from_tensor(D); const char *c_base = static_cast(C->data.dptr); @@ -500,10 +500,11 @@ inline size_t grouped_gemm_setup_workspace_size(size_t num_tensors) { } // namespace -void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, - const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, - NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, - NVTEGroupedMatmulConfig config, cudaStream_t stream) { +void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedTensor B, int transb, + const NVTEGroupedTensor C, NVTEGroupedTensor D, const NVTETensor alpha, + const NVTETensor beta, NVTETensor workspace_setup, + NVTETensor workspace_cublas, NVTEGroupedMatmulConfig config, + cudaStream_t stream) { NVTE_API_CALL(nvte_grouped_gemm); using namespace transformer_engine; @@ -593,10 +594,11 @@ void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVT #else // CUBLAS_VERSION < 130100 -void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, - const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, - NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, - NVTEGroupedMatmulConfig config, cudaStream_t stream) { +void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedTensor B, int transb, + const NVTEGroupedTensor C, NVTEGroupedTensor D, const NVTETensor alpha, + const NVTETensor beta, NVTETensor workspace_setup, + NVTETensor workspace_cublas, NVTEGroupedMatmulConfig config, + cudaStream_t stream) { NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.1+, but compile-time cuBLAS version is ", CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer."); } diff --git a/transformer_engine/common/include/transformer_engine/gemm.h b/transformer_engine/common/include/transformer_engine/gemm.h index 1971714621..cc12fb1c6b 100644 --- a/transformer_engine/common/include/transformer_engine/gemm.h +++ b/transformer_engine/common/include/transformer_engine/gemm.h @@ -308,14 +308,14 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * Uses NVTEGroupedTensor to efficiently handle collections of tensors with contiguous * memory layout and shape metadata. * - * \param[in] transa Whether to transpose A matrices. - * \param[in] transb Whether to transpose B matrices. - * \param[in] alpha Scale multipliers for A @ B (NVTETensor with num_tensors elements). * \param[in] A Input grouped tensor A. + * \param[in] transa Whether to transpose A matrices. * \param[in] B Input grouped tensor B. - * \param[in] beta Scale multipliers for C (NVTETensor with num_tensors elements). + * \param[in] transb Whether to transpose B matrices. * \param[in] C Input grouped tensor C (can be NULL for beta=0). * \param[out] D Output grouped tensor D. + * \param[in] alpha Scale multipliers for A @ B (NVTETensor with num_tensors elements). + * \param[in] beta Scale multipliers for C (NVTETensor with num_tensors elements). * \param[in] workspace_setup Workspace tensor for pointer array setup. * \param[in] workspace_cublas Workspace tensor for cuBLAS operations. * \param[in] config Additional configuration (can be NULL for defaults). @@ -329,10 +329,11 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor * - Shape compatibility: if transa=false, transb=false: * - A[i]: (M[i], K[i]), B[i]: (K[i], N[i]), D[i]: (M[i], N[i]) */ -void nvte_grouped_gemm(int transa, int transb, const NVTETensor alpha, const NVTEGroupedTensor A, - const NVTEGroupedTensor B, const NVTETensor beta, const NVTEGroupedTensor C, - NVTEGroupedTensor D, NVTETensor workspace_setup, NVTETensor workspace_cublas, - NVTEGroupedMatmulConfig config, cudaStream_t stream); +void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedTensor B, int transb, + const NVTEGroupedTensor C, NVTEGroupedTensor D, const NVTETensor alpha, + const NVTETensor beta, NVTETensor workspace_setup, + NVTETensor workspace_cublas, NVTEGroupedMatmulConfig config, + cudaStream_t stream); #ifdef __cplusplus } // extern "C" From f0df80e63b8a3cc60668da6c7124c2a4d5af6ae0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 7 Jan 2026 16:19:17 +0000 Subject: [PATCH 39/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/gemm/cublaslt_grouped_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 5638dc772f..3861ebf857 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -63,7 +63,7 @@ struct TensorShapeInfo { // Create for C tensor (uses D's dimensions, only has offsets) static TensorShapeInfo create_shape_info_for_C(const transformer_engine::GroupedTensor *C, - const transformer_engine::GroupedTensor *D) { + const transformer_engine::GroupedTensor *D) { const bool has_first = D->first_dims.has_data(); const bool has_last = D->last_dims.has_data(); NVTE_CHECK(has_first || D->all_same_first_dim(), From 301874d31dc5d5cfee6d4e5cbaf1037161354222 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Wed, 7 Jan 2026 17:23:23 +0100 Subject: [PATCH 40/98] fix Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 2 +- tests/cpp/test_common.cu | 2 +- tests/cpp/test_common.h | 2 +- transformer_engine/common/gemm/cublaslt_grouped_gemm.cu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 8ff7fa75aa..90d89c77c8 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE for license information. ************************************************************************/ diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 21586fc499..af99d9c42f 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE for license information. ************************************************************************/ diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index 106c336405..ac9f377ef4 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE for license information. ************************************************************************/ diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 3861ebf857..0d376c2e56 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * See LICENSE for license information. ************************************************************************/ From c8cf7633aa29fbb93a05d7b70475ff1366fc43f0 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Wed, 7 Jan 2026 11:22:48 -0800 Subject: [PATCH 41/98] with many hacks grouped gemm with new api works for a particular hardcoded shape --- transformer_engine/jax/cpp_extensions/gemm.py | 15 ++- .../jax/csrc/extensions/gemm.cpp | 105 ++++++++++++++---- 2 files changed, 96 insertions(+), 24 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 28100c9715..38d21f26ec 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -1463,7 +1463,7 @@ class GroupedGemmPrimitive(BasePrimitive): name = "te_grouped_gemm_ffi" multiple_results = True - impl_static_args = (7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + impl_static_args = (9, 10, 11, 12, 13, 14, 15, 16, 17, 18) inner_primitive = None outer_primitive = None @@ -1476,6 +1476,8 @@ def abstract( bias_aval, group_sizes_aval, group_offset_aval, + alpha, + beta, *, M, N, @@ -1535,6 +1537,8 @@ def abstract( # We also pad scale_inv swizzle buffers size for 256 bytes alignment. workspace_size += lhs_scale_inv_aval.size + mxfp8_scaling_sinv_alignment_padding workspace_size += rhs_scale_inv_aval.size + mxfp8_scaling_sinv_alignment_padding + + workspace_size += 1024*1024 # HACK: properly make a workspace_setup buffer in addition to the workspace_cublas buffer workspace_aval = jax.core.ShapedArray(shape=(workspace_size,), dtype=jnp.uint8) out_shape = (M, N) @@ -1587,6 +1591,8 @@ def impl( bias, group_sizes, group_offset, + alpha, + beta, M, N, K, @@ -1607,6 +1613,8 @@ def impl( bias, group_sizes, group_offset, + alpha, + beta, M=M, N=N, K=K, @@ -2115,6 +2123,9 @@ def grouped_gemm( assert not has_bias or bias.shape == (group_sizes.size, N) bias = jnp.empty((), jnp.float32) if bias is None else bias + num_gemms = group_sizes.shape[0] + alpha = jnp.ones((num_gemms,), jnp.float32) + beta = jnp.zeros((num_gemms,), jnp.float32) (out,) = GroupedGemmPrimitive.outer_primitive.bind( lhs_data, lhs_scale_inv, @@ -2123,6 +2134,8 @@ def grouped_gemm( bias, group_sizes, group_offset, + alpha, + beta, M=M, N=N, K=K_lhs, diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 79418c138e..7c2d4c81e6 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -399,10 +399,62 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmD2HGroupSizesHandler, GroupedGemmD2HGro .Ret() // dummy_output .Attr("num_gemms")); +NVTEGroupedTensor make_grouped_tensor(Buffer_Type const& data, std::optional scale_inv, JAXX_Scaling_Mode scaling_mode, size_t num_tensors) { + printf("make_grouped_tensor data shape: "); + for (auto dim : data.dimensions()) { + printf("%zu, ", dim); + } + printf("\n"); + NVTEShape logical_shape{}; + if (data.dimensions().size() == 1) { + // HACK + size_t cdim_size = 4096; + logical_shape.ndim = 2; + logical_shape.data[0] = data.dimensions()[0] / cdim_size; + logical_shape.data[1] = cdim_size; + } + else { + NVTE_CHECK(data.dimensions().size() == 2, "Expected 2D tensor for GEMM operand but received ndim=", data.dimensions().size()); + + logical_shape.ndim = 2; + logical_shape.data[0] = data.dimensions()[0]; + logical_shape.data[1] = data.dimensions()[1]; + } + + NVTEGroupedTensor grouped_tensor = nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, logical_shape); + + NVTEBasicTensor data_tensor{reinterpret_cast(data.untyped_data()), + static_cast(convert_ffi_datatype_to_te_dtype(data.element_type())), + logical_shape}; + nvte_set_grouped_tensor_param(&grouped_tensor, kNVTEGroupedRowwiseData, &data_tensor); + + if (scale_inv.has_value()) { + NVTEShape logical_scale_shape{}; + if (scale_inv->dimensions().size() == 1) { + logical_scale_shape.ndim = 1; + logical_scale_shape.data[0] = scale_inv->dimensions()[0]; + } else if (scale_inv->dimensions().size() == 2) { + logical_scale_shape.ndim = 2; + logical_scale_shape.data[0] = scale_inv->dimensions()[0]; + logical_scale_shape.data[1] = scale_inv->dimensions()[1]; + } else { + NVTE_CHECK(false, "Expected 1D or 2D tensor for GEMM scale_inv but received ndim=", scale_inv->dimensions().size()); + } + NVTEBasicTensor scale_inv_tensor{reinterpret_cast(scale_inv->untyped_data()), + static_cast(convert_ffi_datatype_to_te_dtype(scale_inv->element_type())), + logical_scale_shape}; + nvte_set_grouped_tensor_param(&grouped_tensor, kNVTEGroupedRowwiseScaleInv, &scale_inv_tensor); + } + + return grouped_tensor; +} + Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv, Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias, - Buffer_Type group_sizes, Buffer_Type group_offset, Result_Type output, - Result_Type workspace, size_t m, size_t n, size_t k, bool lhs_is_trans, + Buffer_Type group_sizes, Buffer_Type group_offset, + Buffer_Type alpha, Buffer_Type beta, + Result_Type output, Result_Type workspace, + size_t m, size_t n, size_t k, bool lhs_is_trans, bool rhs_is_trans, JAXX_Scaling_Mode scaling_mode, bool has_bias, bool is_grouped_dense_wgrad, bool use_async_d2h_group_sizes) { // Notes on matrix layouts and transpose: @@ -577,7 +629,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type std::vector bias_list; std::vector pre_gelu_list; std::vector out_list; - std::vector workspace_list; size_t lhs_sinv_total_size = 0; size_t rhs_sinv_total_size = 0; @@ -724,15 +775,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type out_list.push_back(out_wrapper_list.back().data()); } - auto workspace_shape = std::vector{workspace_size}; - for (int i = 0; i < num_streams; i++) { - auto workspace_i = - TensorWrapper(static_cast(workspace_ptr), workspace_shape, DType::kByte); - workspace_wrapper_list.push_back(std::move(workspace_i)); - workspace_list.push_back(workspace_wrapper_list.back().data()); - workspace_ptr += workspace_size; - } - if (is_fp8_gemm) { if (is_tensor_scaling) { lhs_sinv_size *= tensor_scaling_sinv_aligment; @@ -772,20 +814,35 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // pre_gelu_list.data(), num_non_empty_gemms, rhs_is_trans, lhs_is_trans, // grad, workspace_list.data(), accumulate, use_split_accumulator, // num_math_sm, stream); - int64_t avg_m = 0, avg_n = 0, avg_k = 0; + + constexpr size_t workspace_setup_size = 1024 * 1024; // HACK: dummy workspace for setup + TensorWrapper workspace_setup(workspace_ptr, + std::vector{workspace_setup_size}, DType::kByte); + TensorWrapper workspace_cublas(workspace_ptr + workspace_setup_size, + std::vector{workspace_size}, DType::kByte); + + TensorWrapper alpha_tensor(static_cast(alpha.untyped_data()), std::vector{num_gemms}, + convert_ffi_datatype_to_te_dtype(alpha.element_type())); + TensorWrapper beta_tensor(static_cast(beta.untyped_data()), std::vector{num_gemms}, + convert_ffi_datatype_to_te_dtype(beta.element_type())); + + NVTEGroupedTensor rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms); + NVTEGroupedTensor lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms); + NVTEGroupedTensor out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms); + nvte_grouped_gemm( rhs_is_trans, lhs_is_trans, - alpha, - rhs_list, lhs_list, - beta, - C, - out_list, - workspace_setup, - workspace_cublas, + alpha_tensor.data(), + rhs_tensor, lhs_tensor, + beta_tensor.data(), + nullptr, + out_tensor, + workspace_setup.data(), + workspace_cublas.data(), stream, - &avg_m, - &avg_n, - &avg_k); + nullptr, + nullptr, + nullptr); return ffi_with_cuda_error_check(); } @@ -800,6 +857,8 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmHandler, GroupedGemmFFI, .Arg() // bias .Arg() // group_sizes .Arg() // group_offset + .Arg() // alpha + .Arg() // beta .Ret() // output .Ret() // workspace .Attr("M") From 21e7002991831ecd933388f4ad95a53d0d64d69b Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Wed, 7 Jan 2026 11:59:53 -0800 Subject: [PATCH 42/98] progress --- transformer_engine/jax/cpp_extensions/gemm.py | 7 +++ .../jax/csrc/extensions/gemm.cpp | 60 ++++++++++--------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 38d21f26ec..25f3315ba7 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -2123,6 +2123,13 @@ def grouped_gemm( assert not has_bias or bias.shape == (group_sizes.size, N) bias = jnp.empty((), jnp.float32) if bias is None else bias + print(f"{lhs_data.shape=}, {rhs_data.shape=}, {group_sizes.shape=}") + print(f"{M=}, {N=}, {K_lhs=}, {K_rhs=}") + # import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() + # print(f"{lhs_is_trans=}, {rhs_is_trans=}") + # import pdb; pdb.set_trace() + num_gemms = group_sizes.shape[0] alpha = jnp.ones((num_gemms,), jnp.float32) beta = jnp.zeros((num_gemms,), jnp.float32) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 7c2d4c81e6..9543c66356 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -399,33 +399,34 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmD2HGroupSizesHandler, GroupedGemmD2HGro .Ret() // dummy_output .Attr("num_gemms")); -NVTEGroupedTensor make_grouped_tensor(Buffer_Type const& data, std::optional scale_inv, JAXX_Scaling_Mode scaling_mode, size_t num_tensors) { - printf("make_grouped_tensor data shape: "); - for (auto dim : data.dimensions()) { - printf("%zu, ", dim); - } - printf("\n"); - NVTEShape logical_shape{}; - if (data.dimensions().size() == 1) { - // HACK - size_t cdim_size = 4096; - logical_shape.ndim = 2; - logical_shape.data[0] = data.dimensions()[0] / cdim_size; - logical_shape.data[1] = cdim_size; - } - else { - NVTE_CHECK(data.dimensions().size() == 2, "Expected 2D tensor for GEMM operand but received ndim=", data.dimensions().size()); - - logical_shape.ndim = 2; - logical_shape.data[0] = data.dimensions()[0]; - logical_shape.data[1] = data.dimensions()[1]; - } - - NVTEGroupedTensor grouped_tensor = nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, logical_shape); +NVTEGroupedTensor make_grouped_tensor(Buffer_Type const& data, std::optional scale_inv, JAXX_Scaling_Mode scaling_mode, size_t num_tensors, NVTEShape const& dataShape) { + // printf("make_grouped_tensor data shape: "); + // for (auto dim : data.dimensions()) { + // printf("%zu, ", dim); + // } + // printf("\n"); + // NVTEShape logical_shape{}; + // if (data.dimensions().size() == 1) { + // // HACK + // size_t cdim_size = 4096; + // logical_shape.ndim = 2; + // logical_shape.data[0] = data.dimensions()[0] / cdim_size; + // logical_shape.data[1] = cdim_size; + // printf("NUM TENSORS: %zu\n", num_tensors); + // } + // else { + // NVTE_CHECK(data.dimensions().size() == 2, "Expected 2D tensor for GEMM operand but received ndim=", data.dimensions().size()); + + // logical_shape.ndim = 2; + // logical_shape.data[0] = data.dimensions()[0]; + // logical_shape.data[1] = data.dimensions()[1]; + // } + + NVTEGroupedTensor grouped_tensor = nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, dataShape); NVTEBasicTensor data_tensor{reinterpret_cast(data.untyped_data()), static_cast(convert_ffi_datatype_to_te_dtype(data.element_type())), - logical_shape}; + dataShape}; nvte_set_grouped_tensor_param(&grouped_tensor, kNVTEGroupedRowwiseData, &data_tensor); if (scale_inv.has_value()) { @@ -826,9 +827,14 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type TensorWrapper beta_tensor(static_cast(beta.untyped_data()), std::vector{num_gemms}, convert_ffi_datatype_to_te_dtype(beta.element_type())); - NVTEGroupedTensor rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms); - NVTEGroupedTensor lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms); - NVTEGroupedTensor out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms); + NVTEShape rhsShape{.data={num_gemms * k, n}, .ndim=2}; + NVTEGroupedTensor rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); + NVTEShape lhsShape{.data={m, k}, .ndim=2}; + NVTEGroupedTensor lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); + NVTEShape outShape{.data={m, n}, .ndim=2}; + NVTEGroupedTensor out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); + + NVTE_CHECK(!rhs_is_trans && !lhs_is_trans, "TE grouped GEMM only supports non-transposed inputs but received rhs_is_trans=", rhs_is_trans, " lhs_is_trans=", lhs_is_trans); nvte_grouped_gemm( rhs_is_trans, lhs_is_trans, From 1ae08ddd7dfde42a9c2fea90128f19a74f9a191c Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Wed, 7 Jan 2026 13:46:24 -0800 Subject: [PATCH 43/98] more tests pass --- test_einsum.py | 74 +++++++++++++++++++ .../jax/csrc/extensions/gemm.cpp | 16 +++- 2 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 test_einsum.py diff --git a/test_einsum.py b/test_einsum.py new file mode 100644 index 0000000000..5bb05403f2 --- /dev/null +++ b/test_einsum.py @@ -0,0 +1,74 @@ +from enum import Enum + +import jax +import jax.numpy as jnp +import numpy as np +import transformer_engine.jax as te +from transformer_engine.common.recipe import Recipe, Float8CurrentScaling, MXFP8BlockScaling, DelayedScaling, NVFP4BlockScaling +from flax import linen as nn + +def make_einsum_cls(quantization_recipe): + def te_einsum(generate_quantizer_set, s, x, kernel, **kwargs): + def dot_general(x, kernel, dims, *args, **kwargs): + contracting_dims, batch_dims = dims + assert batch_dims == ((), ()), "Batch dims not supported in TE/JAX yet" + + quantizer_set = generate_quantizer_set("quantizer_set_for_einsum") + return te.dense.dense( + x, + kernel, + contracting_dims=contracting_dims, + quantizer_set=quantizer_set, + ) + return jnp.einsum(s, x, kernel, _dot_general=dot_general, **kwargs) + + return te.flax.wrap_function_in_te_state_module(te_einsum, quantization_recipe, "einsum")() + +class EinsumType(Enum): + JAX = 'jax' + TE = 'te' + +def main(): + + class SimpleModel(nn.Module): + + einsum_type: EinsumType + quantization_recipe: Recipe = None + + def _einsum(self, *args, **kwargs): + if self.einsum_type == EinsumType.JAX: + return jnp.einsum(*args, **kwargs) + elif self.einsum_type == EinsumType.TE: + # It is important that we call make_einsum_cls(recipe) here each time einsum + # is called. If we were to call make_einsum_cls only once and re-use it, the state for some recipes such as DelayedScaling would become incorrectly shared instead of each call having its own state. + return make_einsum_cls(self.quantization_recipe)(*args, **kwargs) + else: + raise ValueError(f"Unsupported einsum type: {self.einsum_type}") + + @nn.compact + def __call__(self, x): + kernel = self.param('kernel', jax.nn.initializers.lecun_normal(), (32, 32), jnp.bfloat16) + return self._einsum("ij,jk->ik", x, kernel) + + + def test_model(einsum_type: EinsumType, quantization_recipe: Recipe = None): + model = SimpleModel(einsum_type=einsum_type, quantization_recipe=quantization_recipe) + x = jax.random.uniform(jax.random.PRNGKey(2), (32, 32), jnp.bfloat16) + var_collect = model.init(jax.random.PRNGKey(3), x) + # It is important to use var_collect here to ensure all state (e.g., quantizer states) is properly handled. If you use var_collect['params'] only, TE's state management will not work correctly for recipes that require state (e.g. DelayedScaling). + y = model.apply(var_collect, x) + return y + + # einsum_cls = None, so standard JAX computation + ref_out = test_model(einsum_type=EinsumType.JAX) + + # einsum using Transformer Engine's Float8CurrentScaling recipe + te_out = test_model(einsum_type=EinsumType.TE, quantization_recipe=Float8CurrentScaling()) + + # Compare outputs + atol = float(jnp.finfo(jnp.float8_e4m3fn).eps) + np.testing.assert_allclose(ref_out, te_out, atol=atol) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 9543c66356..61e241b197 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -827,14 +827,26 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type TensorWrapper beta_tensor(static_cast(beta.untyped_data()), std::vector{num_gemms}, convert_ffi_datatype_to_te_dtype(beta.element_type())); - NVTEShape rhsShape{.data={num_gemms * k, n}, .ndim=2}; + NVTEShape rhsShape{.data={k, n}, .ndim=2}; + if (!is_grouped_dense_wgrad) { + rhsShape.data[0] *= num_gemms; + } + if (rhs_is_trans) { + std::swap(rhsShape.data[0], rhsShape.data[1]); + } NVTEGroupedTensor rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); NVTEShape lhsShape{.data={m, k}, .ndim=2}; + if (lhs_is_trans) { + std::swap(lhsShape.data[0], lhsShape.data[1]); + } NVTEGroupedTensor lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); NVTEShape outShape{.data={m, n}, .ndim=2}; + if (is_grouped_dense_wgrad) { + outShape.data[0] *= num_gemms; + } NVTEGroupedTensor out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - NVTE_CHECK(!rhs_is_trans && !lhs_is_trans, "TE grouped GEMM only supports non-transposed inputs but received rhs_is_trans=", rhs_is_trans, " lhs_is_trans=", lhs_is_trans); + // NVTE_CHECK(!rhs_is_trans && !lhs_is_trans, "TE grouped GEMM only supports non-transposed inputs but received rhs_is_trans=", rhs_is_trans, " lhs_is_trans=", lhs_is_trans); nvte_grouped_gemm( rhs_is_trans, lhs_is_trans, From fe39e39be1abfa46642fdde9e3ede365bc1dfb3c Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Wed, 7 Jan 2026 14:25:47 -0800 Subject: [PATCH 44/98] einsum tests pass --- transformer_engine/jax/csrc/extensions/gemm.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 61e241b197..f49530ee1c 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -828,15 +828,16 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type convert_ffi_datatype_to_te_dtype(beta.element_type())); NVTEShape rhsShape{.data={k, n}, .ndim=2}; - if (!is_grouped_dense_wgrad) { - rhsShape.data[0] *= num_gemms; - } if (rhs_is_trans) { std::swap(rhsShape.data[0], rhsShape.data[1]); } + if (!is_grouped_dense_wgrad) { + // If is_grouped_dense_wgrad, then n already includes num_gemms (G) pre-multiplied in gemm.py, so we don't need to multiply it here. + rhsShape.data[0] *= num_gemms; + } NVTEGroupedTensor rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); NVTEShape lhsShape{.data={m, k}, .ndim=2}; - if (lhs_is_trans) { + if (lhs_is_trans && is_grouped_dense_wgrad) { std::swap(lhsShape.data[0], lhsShape.data[1]); } NVTEGroupedTensor lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); From 5e47d57b3e670d86ce37e5e2e44397158360adb4 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 8 Jan 2026 09:37:17 -0800 Subject: [PATCH 45/98] more progress, works in maxtext single-gpu and is closer to bf16 batched gemm speed --- transformer_engine/jax/cpp_extensions/gemm.py | 4 +- .../jax/csrc/extensions/gemm.cpp | 246 +----------------- .../jax/csrc/extensions/quantization.cpp | 26 +- transformer_engine/jax/flax/module.py | 4 +- 4 files changed, 27 insertions(+), 253 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 25f3315ba7..5c53dedb8a 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -2123,8 +2123,8 @@ def grouped_gemm( assert not has_bias or bias.shape == (group_sizes.size, N) bias = jnp.empty((), jnp.float32) if bias is None else bias - print(f"{lhs_data.shape=}, {rhs_data.shape=}, {group_sizes.shape=}") - print(f"{M=}, {N=}, {K_lhs=}, {K_rhs=}") + # print(f"{lhs_data.shape=}, {rhs_data.shape=}, {group_sizes.shape=}") + # print(f"{M=}, {N=}, {K_lhs=}, {K_rhs=}") # import pdb; pdb.set_trace() # import pdb; pdb.set_trace() # print(f"{lhs_is_trans=}, {rhs_is_trans=}") diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index f49530ee1c..0bfab2d7dc 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -534,22 +534,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type size_t bias_dtype_bytes = te_dtype_bytes(bias_dtype); size_t out_dtype_bytes = te_dtype_bytes(out_dtype); - if (is_tensor_scaling) { - size_t dpitch = tensor_scaling_sinv_aligment; - size_t spitch = lhs_sinv_dtype_bytes; - size_t width = lhs_sinv_dtype_bytes; - size_t height = lhs_sinv_size; - cudaMemcpy2DAsync(lhs_scatter_aligned_ptr, dpitch, lhs_sinv_ptr, spitch, width, height, - cudaMemcpyDeviceToDevice, stream); - spitch = rhs_sinv_dtype_bytes; - width = rhs_sinv_dtype_bytes; - height = rhs_sinv_size; - cudaMemcpy2DAsync(rhs_scatter_aligned_ptr, dpitch, rhs_sinv_ptr, spitch, width, height, - cudaMemcpyDeviceToDevice, stream); - lhs_sinv_ptr = lhs_scatter_aligned_ptr; - rhs_sinv_ptr = rhs_scatter_aligned_ptr; - } - NVTE_CHECK(lhs_dtype_bytes == rhs_dtype_bytes, "sizeof(lhs_dtype) != sizeof(rhs_dtype)"); NVTE_CHECK(lhs_sinv_dtype_bytes == rhs_sinv_dtype_bytes, "sizeof(lhs_sinv_dtype) != sizeof(rhs_sinv_dtype)"); @@ -576,29 +560,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type " = ", expected_out_size, ", got ", actual_out_size); } - size_t dim_list_bytes = sizeof(int32_t) * num_gemms; - std::vector dim_list_host(num_gemms); - size_t host_num_gemms = 0; - if (use_async_d2h_group_sizes) { - host_num_gemms = GroupedGemmGetGroupSizes(stream, num_gemms, nullptr, dim_list_host.data()); - NVTE_CHECK(host_num_gemms == num_gemms, "num_gemms ", num_gemms, - " does not match the return of GroupedGemmGetGroupSizes ", host_num_gemms, "."); - } else { - auto dim_list_ptr = reinterpret_cast(group_sizes.untyped_data()); - cudaMemcpyAsync(dim_list_host.data(), dim_list_ptr, dim_list_bytes, cudaMemcpyDeviceToHost, - stream); - // Note: This may break cudaGraph. - cudaStreamSynchronize(stream); - } - size_t sum_group_sizes = std::accumulate(dim_list_host.begin(), dim_list_host.end(), 0); - if (!is_grouped_dense_wgrad) { - NVTE_CHECK(m == sum_group_sizes, "Unexpected group_sizes! M = ", m, - ", got sum(group_sizes)=", sum_group_sizes); - } else { - NVTE_CHECK(k == sum_group_sizes, "Unexpected group_sizes! K = ", k, - ", got sum(group_sizes)=", sum_group_sizes); - } - auto num_math_sm = cuda::sm_count() - getenv("NVTE_EXT_MARGIN_SM", 0); bool grad = false; bool accumulate = false; @@ -612,210 +573,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type "got lhs_is_trans=", lhs_is_trans, ", rhs_is_trans=", rhs_is_trans); } - // These lists are to keep the TensorWrapper objects alive - std::vector lhs_wrapper_list; - std::vector rhs_wrapper_list; - std::vector lhs_swizzle_wrapper_list; // For MXFP8 scale_inv swizzling - std::vector rhs_swizzle_wrapper_list; - std::vector bias_wrapper_list; - std::vector pre_gelu_wrapper_list; - std::vector out_wrapper_list; - std::vector workspace_wrapper_list; - - // These lists are the actual NVTETensor (void *) lists for multi-stream GEMM - std::vector lhs_list; - std::vector rhs_list; - std::vector lhs_swizzle_list; - std::vector rhs_swizzle_list; - std::vector bias_list; - std::vector pre_gelu_list; - std::vector out_list; - - size_t lhs_sinv_total_size = 0; - size_t rhs_sinv_total_size = 0; - - std::vector zero_out_dptr_list; - std::vector zero_out_size_list; - - for (size_t i = 0; i < num_gemms; i++) { - // Matrix data shapes - size_t m_i = dim_list_host[i]; - auto lhs_shape_i = std::vector{m_i, k}; - auto rhs_shape_i = std::vector{rhs_is_trans ? n : k, rhs_is_trans ? k : n}; - auto out_shape_i = std::vector{m_i, n}; - if (is_grouped_dense_wgrad) { - size_t k_i = dim_list_host[i]; - lhs_shape_i[0] = lhs_is_trans ? k_i : m; - lhs_shape_i[1] = lhs_is_trans ? m : k_i; - rhs_shape_i[0] = rhs_is_trans ? n : k_i; - rhs_shape_i[1] = rhs_is_trans ? k_i : n; - out_shape_i[0] = m; - out_shape_i[1] = n; - } - - size_t lhs_size = lhs_shape_i[0] * lhs_shape_i[1]; - size_t rhs_size = rhs_shape_i[0] * rhs_shape_i[1]; - size_t out_size = out_shape_i[0] * out_shape_i[1]; - bool is_empty_gemm = lhs_size == 0 || rhs_size == 0; - if (is_empty_gemm && out_size > 0) { - zero_out_dptr_list.push_back(out_ptr); - zero_out_size_list.push_back(out_size * out_dtype_bytes); - } - - // Set matrix data pointers - auto lhs_i = TensorWrapper(get_nvte_scaling_mode(scaling_mode)); - auto rhs_i = TensorWrapper(get_nvte_scaling_mode(scaling_mode)); - auto out_i = TensorWrapper(static_cast(out_ptr), out_shape_i, out_dtype); - void *lhs_vptr = static_cast(lhs_ptr); - void *rhs_vptr = static_cast(rhs_ptr); - if (rhs_use_colwise) // MatA to enter cuBLAS - rhs_i.set_columnwise_data(rhs_vptr, rhs_dtype, rhs_shape_i); - else - rhs_i.set_rowwise_data(rhs_vptr, rhs_dtype, rhs_shape_i); - if (lhs_use_colwise) // MatB to enter cuBLAS - lhs_i.set_columnwise_data(lhs_vptr, lhs_dtype, lhs_shape_i); - else - lhs_i.set_rowwise_data(lhs_vptr, lhs_dtype, lhs_shape_i); - - // Set scale_inv shapes and pointers - void *rhs_sinv_vptr = static_cast(rhs_sinv_ptr); - void *lhs_sinv_vptr = static_cast(lhs_sinv_ptr); - size_t lhs_sinv_size_i = 0; - size_t rhs_sinv_size_i = 0; - if (is_tensor_scaling) { - auto tensor_scaling_sinv_shape = std::vector{1}; - // If is_empty_gemm, scale_inv does not have the corresponding value, do not move the pointers - if (!is_empty_gemm) { - lhs_sinv_size_i = tensor_scaling_sinv_aligment / lhs_sinv_dtype_bytes; - rhs_sinv_size_i = tensor_scaling_sinv_aligment / rhs_sinv_dtype_bytes; - } - if (rhs_use_colwise) // MatA to enter cuBLAS - rhs_i.set_columnwise_scale_inv(rhs_sinv_vptr, rhs_sinv_dtype, tensor_scaling_sinv_shape); - else - rhs_i.set_rowwise_scale_inv(rhs_sinv_vptr, rhs_sinv_dtype, tensor_scaling_sinv_shape); - if (lhs_use_colwise) // MatB to enter cuBLAS - lhs_i.set_columnwise_scale_inv(lhs_sinv_vptr, lhs_sinv_dtype, tensor_scaling_sinv_shape); - else - lhs_i.set_rowwise_scale_inv(lhs_sinv_vptr, lhs_sinv_dtype, tensor_scaling_sinv_shape); - } else if (is_mxfp8_scaling) { - auto lhs_swizzle_i = TensorWrapper(get_nvte_scaling_mode(scaling_mode)); - auto rhs_swizzle_i = TensorWrapper(get_nvte_scaling_mode(scaling_mode)); - void *swizzled_lhs_sinv_vptr = static_cast(swizzled_lhs_sinv_ptr); - void *swizzled_rhs_sinv_vptr = static_cast(swizzled_rhs_sinv_ptr); - - // {lhs, rhs}_swizzle_i point to unswizzled scale_inv data as input, while {lhs, rhs}_i - // point to swizzled scale_inv data (store on workspace, only used for GEMM). - // Note: even if is_empty_gemm is true, sinv are still non-empty, need to move the pointers - auto lhs_sinv_shape_i = - get_block_scale_shape(scaling_mode, lhs_shape_i[0], lhs_shape_i[1], lhs_use_colwise); - auto rhs_sinv_shape_i = - get_block_scale_shape(scaling_mode, rhs_shape_i[0], rhs_shape_i[1], rhs_use_colwise); - lhs_sinv_size_i = lhs_sinv_shape_i[0] * lhs_sinv_shape_i[1]; - rhs_sinv_size_i = rhs_sinv_shape_i[0] * rhs_sinv_shape_i[1]; - if (lhs_use_colwise) { - lhs_swizzle_i.set_columnwise_data(lhs_vptr, lhs_dtype, lhs_shape_i); - lhs_swizzle_i.set_columnwise_scale_inv(lhs_sinv_vptr, lhs_sinv_dtype, lhs_sinv_shape_i); - lhs_i.set_columnwise_scale_inv(swizzled_lhs_sinv_vptr, lhs_sinv_dtype, lhs_sinv_shape_i); - } else { - lhs_swizzle_i.set_rowwise_data(lhs_vptr, lhs_dtype, lhs_shape_i); - lhs_swizzle_i.set_rowwise_scale_inv(lhs_sinv_vptr, lhs_sinv_dtype, lhs_sinv_shape_i); - lhs_i.set_rowwise_scale_inv(swizzled_lhs_sinv_vptr, lhs_sinv_dtype, lhs_sinv_shape_i); - } - if (rhs_use_colwise) { - rhs_swizzle_i.set_columnwise_data(rhs_vptr, rhs_dtype, rhs_shape_i); - rhs_swizzle_i.set_columnwise_scale_inv(rhs_sinv_vptr, rhs_sinv_dtype, rhs_sinv_shape_i); - rhs_i.set_columnwise_scale_inv(swizzled_rhs_sinv_vptr, rhs_sinv_dtype, rhs_sinv_shape_i); - } else { - rhs_swizzle_i.set_rowwise_data(rhs_vptr, rhs_dtype, rhs_shape_i); - rhs_swizzle_i.set_rowwise_scale_inv(rhs_sinv_vptr, rhs_sinv_dtype, rhs_sinv_shape_i); - rhs_i.set_rowwise_scale_inv(swizzled_rhs_sinv_vptr, rhs_sinv_dtype, rhs_sinv_shape_i); - } - - if (!is_empty_gemm) { - lhs_swizzle_wrapper_list.push_back(std::move(lhs_swizzle_i)); - rhs_swizzle_wrapper_list.push_back(std::move(rhs_swizzle_i)); - lhs_swizzle_list.push_back(lhs_swizzle_wrapper_list.back().data()); - rhs_swizzle_list.push_back(rhs_swizzle_wrapper_list.back().data()); - } - } else { - NVTE_CHECK(scaling_mode == JAXX_Scaling_Mode::NO_SCALING, - "Unsupported scaling mode: ", static_cast(scaling_mode)); - } - - auto bias_i = TensorWrapper(bias_ptr, bias_shape, bias_dtype); - auto pre_gelu_i = TensorWrapper(nullptr, std::vector{0}, out_dtype); - - // Update pointer for the next GEMM pair - lhs_ptr += lhs_size * lhs_dtype_bytes; - rhs_ptr += rhs_size * rhs_dtype_bytes; - out_ptr += out_size * out_dtype_bytes; - if (is_fp8_gemm) { - lhs_sinv_ptr += lhs_sinv_size_i * lhs_sinv_dtype_bytes; - rhs_sinv_ptr += rhs_sinv_size_i * rhs_sinv_dtype_bytes; - lhs_sinv_total_size += lhs_sinv_size_i; - rhs_sinv_total_size += rhs_sinv_size_i; - if (is_mxfp8_scaling) { - swizzled_lhs_sinv_ptr += lhs_sinv_size_i * lhs_sinv_dtype_bytes; - swizzled_rhs_sinv_ptr += rhs_sinv_size_i * rhs_sinv_dtype_bytes; - } - } - if (has_bias) bias_ptr += n * bias_dtype_bytes; - - // Move objects to the lists to keep them alive - if (is_empty_gemm) continue; - lhs_wrapper_list.push_back(std::move(lhs_i)); - rhs_wrapper_list.push_back(std::move(rhs_i)); - out_wrapper_list.push_back(std::move(out_i)); - bias_wrapper_list.push_back(std::move(bias_i)); - pre_gelu_wrapper_list.push_back(std::move(pre_gelu_i)); - - lhs_list.push_back(lhs_wrapper_list.back().data()); - rhs_list.push_back(rhs_wrapper_list.back().data()); - bias_list.push_back(bias_wrapper_list.back().data()); - pre_gelu_list.push_back(pre_gelu_wrapper_list.back().data()); - out_list.push_back(out_wrapper_list.back().data()); - } - - if (is_fp8_gemm) { - if (is_tensor_scaling) { - lhs_sinv_size *= tensor_scaling_sinv_aligment; - rhs_sinv_size *= tensor_scaling_sinv_aligment; - } - NVTE_CHECK(lhs_sinv_total_size <= lhs_sinv_size, "Actual total lhs_sinv size ", - lhs_sinv_total_size, " exceeds estimated upper bound ", lhs_sinv_size); - NVTE_CHECK(rhs_sinv_total_size <= rhs_sinv_size, "Actual total rhs_sinv size ", - rhs_sinv_total_size, " exceeds estimated upper bound ", rhs_sinv_size); - } - - size_t num_non_empty_gemms = lhs_list.size(); - - if (is_mxfp8_scaling) { - for (int i = 0; i < num_non_empty_gemms; i++) { - // The i-th GEMM will use the (i % num_streams)-th stream to compute, - // use the same stream to swizzle the scaling factors to make sure that - // the swizzling is done before the GEMM computation starts. - int stream_id = i % num_streams; - cudaStream_t stream_i = nvte_get_compute_stream(stream_id); - nvte_swizzle_scaling_factors(lhs_swizzle_list[i], lhs_list[i], stream_i); - nvte_swizzle_scaling_factors(rhs_swizzle_list[i], rhs_list[i], stream_i); - } - } - - // Launch zero-out kernels before the GEMM calls to use the sync in the multi-stream GEMM - size_t num_zero_outs = zero_out_dptr_list.size(); - for (int i = 0; i < num_zero_outs; i++) { - int stream_id = i % num_streams; - cudaStream_t stream_i = nvte_get_compute_stream(stream_id); - void *dptr = zero_out_dptr_list[i]; - size_t count = zero_out_size_list[i]; - NVTE_CHECK_CUDA(cudaMemsetAsync(dptr, 0, count, stream_i)); - } - - // nvte_multi_tensor_gemm(rhs_list.data(), lhs_list.data(), out_list.data(), bias_list.data(), - // pre_gelu_list.data(), num_non_empty_gemms, rhs_is_trans, lhs_is_trans, - // grad, workspace_list.data(), accumulate, use_split_accumulator, - // num_math_sm, stream); - constexpr size_t workspace_setup_size = 1024 * 1024; // HACK: dummy workspace for setup TensorWrapper workspace_setup(workspace_ptr, std::vector{workspace_setup_size}, DType::kByte); @@ -888,7 +645,8 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmHandler, GroupedGemmFFI, .Attr("scaling_mode") .Attr("has_bias") .Attr("is_grouped_dense_wgrad") - .Attr("use_async_d2h_group_sizes")); + .Attr("use_async_d2h_group_sizes"), + FFI_CudaGraph_Traits); } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/quantization.cpp b/transformer_engine/jax/csrc/extensions/quantization.cpp index 1f7db84383..ad3553313f 100644 --- a/transformer_engine/jax/csrc/extensions/quantization.cpp +++ b/transformer_engine/jax/csrc/extensions/quantization.cpp @@ -375,11 +375,24 @@ Error_Type GroupedQuantizeFFI(cudaStream_t stream, Buffer_Type inputs, Buffer_Ty size_t num_groups = group_sizes.dimensions()[0]; size_t dim_list_bytes = group_size_dtype_bytes * num_groups; std::vector dim_list_host(num_groups); - auto *group_size_ptr = reinterpret_cast(group_sizes.untyped_data()); - cudaMemcpyAsync(dim_list_host.data(), group_size_ptr, dim_list_bytes, cudaMemcpyDeviceToHost, - stream); - // Note: This may break cudaGraph. - cudaStreamSynchronize(stream); + // HACK: assumes batched gemm with equal group sizes + for (size_t i = 0; i < num_groups; i++) { + if (input_dims[0] == num_groups) { + dim_list_host[i] = 1; + continue; + } + dim_list_host[i] = m / num_groups; + } + // auto *group_size_ptr = reinterpret_cast(group_sizes.untyped_data()); + // cudaMemcpyAsync(dim_list_host.data(), group_size_ptr, dim_list_bytes, cudaMemcpyDeviceToHost, + // stream); + // // Note: This may break cudaGraph. + // cudaStreamSynchronize(stream); + // printf("GroupedQuantizeFFI: m=%zu, n=%zu, group sizes = ", m, n); + // for (size_t i = 0; i < num_groups; i++) { + // printf("%d ", dim_list_host[i]); + // } + // printf("\n"); size_t sum_group_sizes = std::accumulate(dim_list_host.begin(), dim_list_host.end(), 0); NVTE_CHECK(m == sum_group_sizes || input_dims[0] == sum_group_sizes, @@ -492,7 +505,8 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedQuantizeHandler, GroupedQuantizeFFI, .Ret() // amax .Attr("scaling_mode") .Attr("q_layout") - .Attr("flatten_axis")); + .Attr("flatten_axis"), + FFI_CudaGraph_Traits); } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index cc6088e8d2..3b4a5ef148 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1444,6 +1444,7 @@ def te_dot_general(generate_quantizer_set, x, kernel, dims, **kwargs): def make_einsum_cls(quantization_recipe): import functools + import math import jax def te_einsum(generate_quantizer_set, s, x, kernel, **kwargs): # with open("/tmp/te_einsum_log.txt", "a") as f: @@ -1493,7 +1494,8 @@ def reorder_rhs_for_grouped_gemm(tensor, bdims, cdims): kernel = reorder_rhs_for_grouped_gemm(kernel, (batch_dims[1],), contracting_dims[1]) num_groups = kernel.shape[0] - group_size = x.shape[0] // num_groups + group_size = math.prod(x.shape[:-1]) // num_groups + print(f'{num_groups=}, {group_size=}, {x.shape=}, {kernel.shape=}') group_sizes = jnp.array([group_size]*num_groups, dtype=jnp.int32) From bc6cf66512bf4a4a35ce9e014768bb34f749744b Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 8 Jan 2026 10:44:12 -0800 Subject: [PATCH 46/98] attempt at passing thru stateful args for DS --- transformer_engine/jax/quantize/quantizer.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/transformer_engine/jax/quantize/quantizer.py b/transformer_engine/jax/quantize/quantizer.py index 4edc187795..6831758875 100644 --- a/transformer_engine/jax/quantize/quantizer.py +++ b/transformer_engine/jax/quantize/quantizer.py @@ -7,7 +7,7 @@ This module provides classes and utilities for quantizing tensors in JAX. """ from abc import ABC, abstractmethod -from dataclasses import dataclass, field +from dataclasses import dataclass, field, InitVar from functools import partial from typing import Union, Optional, Tuple import warnings @@ -893,6 +893,7 @@ class GroupedQuantizer(Quantizer): data_layout: str = None n_groups: int = 1 quantizers: Tuple[Quantizer] = field(default_factory=lambda: (None,)) + extra_kwargs: InitVar[dict] = None def tree_flatten(self): """Flatten the quantizer for JAX tree operations. @@ -911,10 +912,12 @@ def tree_flatten(self): ) return (children, aux_data) - def __post_init__(self): + def __post_init__(self, extra_kwargs: dict): + print(f"QuantizerFactory creating quantizers for GroupedQuantizer: {self.n_groups=}, {self.scaling_mode=}, {self.q_dtype=}, {self.q_layout=}, {extra_kwargs=}, {self.quantizers=}") if self.quantizers[0] is None: quantizers = QuantizerFactory.create( - self.n_groups, self.scaling_mode, self.q_dtype, self.q_layout + n_quantizers=self.n_groups, + scaling_mode=self.scaling_mode, q_dtype=self.q_dtype, q_layout=self.q_layout, **extra_kwargs ) self.quantizers = (quantizers,) if not isinstance(quantizers, tuple) else quantizers self.data_layout = self.quantizers[0].data_layout @@ -1106,8 +1109,14 @@ def create( warnings.warn( "Using more than one GroupedQuantizer for a grouped input is not recommended" ) - quantizer_type = GroupedQuantizer - kwargs["n_groups"] = n_groups + quantizer_type = lambda q_dtype, scaling_mode, q_layout, checkpoint_name, **kwargs: GroupedQuantizer( + q_dtype=q_dtype, + scaling_mode=scaling_mode, + q_layout=q_layout, + checkpoint_name=checkpoint_name, + n_groups=n_groups, + extra_kwargs=kwargs, + ) else: quantizer_type = QuantizerFactory.quantizer_type_map.get(scaling_mode) From bcbe864825fa8f40103c72b8b750a807490de28f Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 8 Jan 2026 10:44:18 -0800 Subject: [PATCH 47/98] Revert "attempt at passing thru stateful args for DS" This reverts commit bc6cf66512bf4a4a35ce9e014768bb34f749744b. --- transformer_engine/jax/quantize/quantizer.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/transformer_engine/jax/quantize/quantizer.py b/transformer_engine/jax/quantize/quantizer.py index 6831758875..4edc187795 100644 --- a/transformer_engine/jax/quantize/quantizer.py +++ b/transformer_engine/jax/quantize/quantizer.py @@ -7,7 +7,7 @@ This module provides classes and utilities for quantizing tensors in JAX. """ from abc import ABC, abstractmethod -from dataclasses import dataclass, field, InitVar +from dataclasses import dataclass, field from functools import partial from typing import Union, Optional, Tuple import warnings @@ -893,7 +893,6 @@ class GroupedQuantizer(Quantizer): data_layout: str = None n_groups: int = 1 quantizers: Tuple[Quantizer] = field(default_factory=lambda: (None,)) - extra_kwargs: InitVar[dict] = None def tree_flatten(self): """Flatten the quantizer for JAX tree operations. @@ -912,12 +911,10 @@ def tree_flatten(self): ) return (children, aux_data) - def __post_init__(self, extra_kwargs: dict): - print(f"QuantizerFactory creating quantizers for GroupedQuantizer: {self.n_groups=}, {self.scaling_mode=}, {self.q_dtype=}, {self.q_layout=}, {extra_kwargs=}, {self.quantizers=}") + def __post_init__(self): if self.quantizers[0] is None: quantizers = QuantizerFactory.create( - n_quantizers=self.n_groups, - scaling_mode=self.scaling_mode, q_dtype=self.q_dtype, q_layout=self.q_layout, **extra_kwargs + self.n_groups, self.scaling_mode, self.q_dtype, self.q_layout ) self.quantizers = (quantizers,) if not isinstance(quantizers, tuple) else quantizers self.data_layout = self.quantizers[0].data_layout @@ -1109,14 +1106,8 @@ def create( warnings.warn( "Using more than one GroupedQuantizer for a grouped input is not recommended" ) - quantizer_type = lambda q_dtype, scaling_mode, q_layout, checkpoint_name, **kwargs: GroupedQuantizer( - q_dtype=q_dtype, - scaling_mode=scaling_mode, - q_layout=q_layout, - checkpoint_name=checkpoint_name, - n_groups=n_groups, - extra_kwargs=kwargs, - ) + quantizer_type = GroupedQuantizer + kwargs["n_groups"] = n_groups else: quantizer_type = QuantizerFactory.quantizer_type_map.get(scaling_mode) From b40353fbad69d3b90197f1ea8dd28dee9263d593 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 8 Jan 2026 14:06:45 -0800 Subject: [PATCH 48/98] batch gemm specialization for CS amax calc --- .../jax/cpp_extensions/quantization.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py index a95afe8b8e..b8ea3bd4f4 100644 --- a/transformer_engine/jax/cpp_extensions/quantization.py +++ b/transformer_engine/jax/cpp_extensions/quantization.py @@ -1209,21 +1209,26 @@ def grouped_quantize( assert n_groups == len( quantizer.quantizers ), f"n_groups={n_groups} != n_quantizers = {len(quantizer.quantizers)}" - scale = jnp.empty((n_groups,), jnp.float32) + scale = jnp.ones((n_groups,), jnp.float32) if quantizer.scaling_mode == ScalingMode.DELAYED_TENSOR_SCALING: for i, quantizer_i in enumerate(quantizer.quantizers): scale = scale.at[i].set(quantizer_i.scale[0]) if quantizer.scaling_mode == ScalingMode.CURRENT_TENSOR_SCALING: - if amax is not None: - row_amax = amax - else: - row_amax = jnp.max(jnp.abs(x), axis=range(group_axis + 1, x.ndim)) - segment_ids = jnp.repeat( - jnp.arange(n_groups), group_sizes, total_repeat_length=x.shape[group_axis] - ) - grouped_amax = jax.ops.segment_max(row_amax, segment_ids, num_segments=n_groups) + # TODO fixme, measure perf with always scale/amax of 1 to just isolate quant and gemm + # HACK: assumes equal group sizes + assert group_axis == 0, f"Currently only group_axis = 0 is supported for current-tensor-scaling, but received {group_axis=}" + grouped_amax = jnp.max(jnp.abs(x.reshape((n_groups, x.shape[0]//n_groups, *x.shape[1:]))), axis=tuple(range(1, x.ndim+1))) + # import pdb; pdb.set_trace() + # if amax is not None: + # row_amax = amax + # else: + # row_amax = jnp.max(jnp.abs(x), axis=range(group_axis + 1, x.ndim)) + # segment_ids = jnp.repeat( + # jnp.arange(n_groups), group_sizes, total_repeat_length=x.shape[group_axis] + # ) + # grouped_amax = jax.ops.segment_max(row_amax, segment_ids, num_segments=n_groups) for i in range(n_groups): tmp_scale = compute_scale_from_amax(grouped_amax[i], quantizer.q_dtype, margin=0.0) scale = scale.at[i].set(tmp_scale[0]) From 6c5d96941522cecfffad51d68a16c2a79428012b Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Fri, 9 Jan 2026 17:27:20 +0100 Subject: [PATCH 49/98] fix Signed-off-by: Pawel Gadzinski --- transformer_engine/common/gemm/config.h | 6 ++++-- .../common/gemm/cublaslt_grouped_gemm.cu | 21 +++++++++---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/transformer_engine/common/gemm/config.h b/transformer_engine/common/gemm/config.h index b1aaae2591..cdea24ea7e 100644 --- a/transformer_engine/common/gemm/config.h +++ b/transformer_engine/common/gemm/config.h @@ -44,8 +44,10 @@ struct GroupedMatmulConfig { // Number of streaming multiprocessors to use in GEMM kernel int sm_count = 0; - static constexpr size_t attr_sizes[] = {sizeof(avg_m), sizeof(avg_n), sizeof(avg_k), - sizeof(sm_count)}; + // Note: API transfers the value type, not std::optional + static constexpr size_t attr_sizes[] = {sizeof(decltype(avg_m)::value_type), + sizeof(decltype(avg_n)::value_type), + sizeof(decltype(avg_k)::value_type), sizeof(sm_count)}; }; } // namespace transformer_engine diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index 0d376c2e56..a03e5b516a 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -214,7 +214,7 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor // fallback to column-wise data when row-wise is absent. struct GroupedOperandSelection { const transformer_engine::GroupedTensor *tensor = nullptr; - const char *dptr = nullptr; + char *dptr = nullptr; transformer_engine::DType dtype = transformer_engine::DType::kNumTypes; bool trans = false; bool use_columnwise = false; @@ -248,7 +248,7 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: if (is_A) { if (!sel.trans) { NVTE_CHECK(has_col, "Grouped GEMM: A is missing column-wise data needed for FP8 TN layout"); - sel.dptr = static_cast(t->columnwise_data.dptr); + sel.dptr = static_cast(t->columnwise_data.dptr); sel.dtype = col_dtype; sel.trans = true; // using pre-transposed storage sel.use_columnwise = true; @@ -257,7 +257,7 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: } else { // B if (sel.trans) { NVTE_CHECK(has_col, "Grouped GEMM: B is missing column-wise data needed for FP8 TN layout"); - sel.dptr = static_cast(t->columnwise_data.dptr); + sel.dptr = static_cast(t->columnwise_data.dptr); sel.dtype = col_dtype; sel.trans = false; // using pre-transposed storage sel.use_columnwise = true; @@ -272,7 +272,7 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: NVTE_CHECK( !is_fp8 || non_tn_fp8_ok, "Grouped GEMM: FP8 on Hopper requires row-wise data for this transpose configuration"); - sel.dptr = static_cast(t->columnwise_data.dptr); + sel.dptr = static_cast(t->columnwise_data.dptr); sel.dtype = col_dtype; sel.trans = !sel.trans; sel.use_columnwise = true; @@ -280,7 +280,7 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: } // Default: use row-wise data (column-wise case already handled above) - sel.dptr = static_cast(t->data.dptr); + sel.dptr = static_cast(t->data.dptr); sel.dtype = row_dtype; sel.use_columnwise = false; return sel; @@ -414,7 +414,7 @@ __global__ void setup_grouped_gemm_kernel( void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, int *M, int *N, int *K, float **alpha_ptrs, float **beta_ptrs, // Base pointers - const char *a_base, const char *b_base, const char *c_base, char *d_base, + char *a_base, char *b_base, char *c_base, char *d_base, // Dimension info (per tensor) TensorShapeInfo A_meta, TensorShapeInfo B_meta, TensorShapeInfo C_meta, TensorShapeInfo D_meta, // Element sizes @@ -445,10 +445,9 @@ __global__ void setup_grouped_gemm_kernel( D_meta.offsets ? D_meta.offsets[idx] : (idx * D_meta.uniform_first * D_meta.uniform_last); // Compute data pointers - // Note: const_cast is safe here - cuBLAS requires void** but won't modify A/B/C data - A_ptrs[idx] = const_cast(a_base) + a_offset * a_elem_size; - B_ptrs[idx] = const_cast(b_base) + b_offset * b_elem_size; - C_ptrs[idx] = const_cast(c_base) + c_offset * c_elem_size; + A_ptrs[idx] = a_base + a_offset * a_elem_size; + B_ptrs[idx] = b_base + b_offset * b_elem_size; + C_ptrs[idx] = c_base + c_offset * c_elem_size; D_ptrs[idx] = d_base + d_offset * d_elem_size; // Compute M, N, K dimensions from tensor shapes @@ -474,7 +473,7 @@ inline void launch_grouped_gemm_setup( TensorShapeInfo C_meta = TensorShapeInfo::create_shape_info_for_C(C, D); TensorShapeInfo D_meta = TensorShapeInfo::from_tensor(D); - const char *c_base = static_cast(C->data.dptr); + char *c_base = static_cast(C->data.dptr); char *d_base = static_cast(D->data.dptr); const size_t a_elem_size = transformer_engine::typeToSize(A_sel.dtype); From c91cd8ffa2e5ec93247a716bede7851673c94b0c Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Fri, 9 Jan 2026 17:32:19 +0100 Subject: [PATCH 50/98] fix Signed-off-by: Pawel Gadzinski --- .../common/gemm/cublaslt_gemm.cu | 33 ++++++++----------- .../common/gemm/cublaslt_grouped_gemm.cu | 5 ++- .../common/util/cuda_runtime.cpp | 7 ++++ transformer_engine/common/util/cuda_runtime.h | 6 ++++ 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 118bf19335..7c04c14eff 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -302,13 +302,6 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla return ret; } -/* cuBLAS version number at run-time */ -size_t cublas_version() { - // Cache version to avoid cuBLAS logging overhead - static size_t version = cublasLtGetVersion(); - return version; -} - } // namespace namespace transformer_engine { @@ -501,8 +494,8 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, #endif // CUBLAS_VERSION >= 120800 } else if (mxfp8_gemm) { #if CUBLAS_VERSION >= 120800 - NVTE_CHECK(cublas_version() >= 120800, - "MXFP8 requires cuBLAS 12.8+, but run-time cuBLAS version is ", cublas_version()); + NVTE_CHECK(cuda::cublas_version() >= 120800, + "MXFP8 requires cuBLAS 12.8+, but run-time cuBLAS version is ", cuda::cublas_version()); fp8e8m0 *A_scale_inverse = reinterpret_cast(param.A_scale_inv); fp8e8m0 *B_scale_inverse = reinterpret_cast(param.B_scale_inv); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, @@ -515,7 +508,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, scaling_mode_b = CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0; // Workaround for heuristic cache bug in cublasLt. This separates the MXFP8 cache key from non-block scaling. // CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is unused for block scaling so it's safe to set. - if (cublas_version() <= 120803) { + if (cuda::cublas_version() <= 120803) { const int64_t dummy_a_vec_stride = 1; NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( operationDesc, CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE, &dummy_a_vec_stride, @@ -527,8 +520,8 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, #endif // CUBLAS_VERSION >= 120800 } else if (use_fp4) { // NVFP4 GEMM #if CUBLAS_VERSION >= 120800 - NVTE_CHECK(cublas_version() >= 120800, - "FP4 requires cuBLAS 12.8+, but run-time cuBLAS version is ", cublas_version()); + NVTE_CHECK(cuda::cublas_version() >= 120800, + "FP4 requires cuBLAS 12.8+, but run-time cuBLAS version is ", cuda::cublas_version()); // make sure alpha beta computation dtype remains fp32 by CUBLASLT_MATMUL_DESC_SCALE_TYPE cublasDataType_t scale_type = CUDA_R_32F; NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( @@ -558,9 +551,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, (inputB->scaling_mode == NVTE_BLOCK_SCALING_1D || inputB->scaling_mode == NVTE_BLOCK_SCALING_2D)) { #if CUBLAS_VERSION >= 120900 - NVTE_CHECK(cublas_version() >= 120900, + NVTE_CHECK(cuda::cublas_version() >= 120900, "FP8 block scaling requires cuBLAS 12.9+, but run-time cuBLAS version is ", - cublas_version()); + cuda::cublas_version()); float *A_scale_inverse = reinterpret_cast(param.A_scale_inv); float *B_scale_inverse = reinterpret_cast(param.B_scale_inv); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, @@ -588,7 +581,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, } #if CUBLAS_VERSION >= 120800 - if (cublas_version() >= 120800) { + if (cuda::cublas_version() >= 120800) { NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_A_SCALE_MODE, &scaling_mode_a, sizeof(scaling_mode_a))); @@ -605,7 +598,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( operationDesc, CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, &D_amax, sizeof(D_amax))); #if CUBLAS_VERSION >= 120800 - if (cublas_version() >= 120800) { + if (cuda::cublas_version() >= 120800) { // NOTE: In all current cases where FP8 output is supported, the input is // scaled identically to the output. NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, @@ -692,9 +685,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, NVTE_CHECK(cuda::cudart_version() >= 12020 && cuda::cudart_version() < 13000, "Atomic GEMM requires CUDA >=12.2.0 and <13.0.0, but run-time CUDA version is ", cuda::cudart_version()); - NVTE_CHECK(cublas_version() >= 120205 && cublas_version() < 130000, + NVTE_CHECK(cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000, "Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but run-time cuBLAS version is ", - cublas_version()); + cuda::cublas_version()); if (m_split == 0) m_split = 1; if (n_split == 0) n_split = 1; NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( @@ -920,9 +913,9 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor "Atomic GEMM requires CUDA version >=12.2.0 and <13.0.0, but run-time CUDA version is ", transformer_engine::cuda::cudart_version()); NVTE_CHECK( - cublas_version() >= 120205 && cublas_version() < 130000, + cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000, "Atomic GEMM requires cuBLAS version >=12.2.5 and <13.0.0, but run-time cuBLAS version is ", - cublas_version()); + cuda::cublas_version()); const Tensor *inputA = convertNVTETensorCheck(A); const Tensor *inputB = convertNVTETensorCheck(B); diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index a03e5b516a..d4696c9127 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -507,10 +507,13 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT NVTE_API_CALL(nvte_grouped_gemm); using namespace transformer_engine; - // Grouped GEMM requires Blackwell (SM100) or newer + // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.1+ const int current_device = cuda::current_device(); NVTE_CHECK(cuda::sm_arch(current_device) >= 100, "nvte_grouped_gemm requires Blackwell (SM100) or newer architecture."); + NVTE_CHECK(cuda::cublas_version() >= 130100, + "nvte_grouped_gemm requires cuBLAS 13.1+, but run-time cuBLAS version is ", + cuda::cublas_version()); // Convert to internal types const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A); diff --git a/transformer_engine/common/util/cuda_runtime.cpp b/transformer_engine/common/util/cuda_runtime.cpp index 2e5ef8b8e1..47cfa6bc96 100644 --- a/transformer_engine/common/util/cuda_runtime.cpp +++ b/transformer_engine/common/util/cuda_runtime.cpp @@ -6,6 +6,7 @@ #include "../util/cuda_runtime.h" +#include #include #include @@ -210,6 +211,12 @@ int cudart_version() { return version; } +size_t cublas_version() { + // Cache version to avoid cuBLAS logging overhead + static size_t version = cublasLtGetVersion(); + return version; +} + } // namespace cuda } // namespace transformer_engine diff --git a/transformer_engine/common/util/cuda_runtime.h b/transformer_engine/common/util/cuda_runtime.h index 6b999870dd..b7b9680688 100644 --- a/transformer_engine/common/util/cuda_runtime.h +++ b/transformer_engine/common/util/cuda_runtime.h @@ -73,6 +73,12 @@ const std::string &include_directory(bool required = false); */ int cudart_version(); +/* \brief cuBLAS version number at run-time + * + * Versions may differ between compile-time and run-time. + */ +size_t cublas_version(); + } // namespace cuda } // namespace transformer_engine From 0319e79ee06153c560e953548b302b4aee69b5f5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 Jan 2026 16:33:05 +0000 Subject: [PATCH 51/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/gemm/cublaslt_gemm.cu | 6 ++++-- transformer_engine/common/util/cuda_runtime.cpp | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 7c04c14eff..b82fe82b63 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -495,7 +495,8 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, } else if (mxfp8_gemm) { #if CUBLAS_VERSION >= 120800 NVTE_CHECK(cuda::cublas_version() >= 120800, - "MXFP8 requires cuBLAS 12.8+, but run-time cuBLAS version is ", cuda::cublas_version()); + "MXFP8 requires cuBLAS 12.8+, but run-time cuBLAS version is ", + cuda::cublas_version()); fp8e8m0 *A_scale_inverse = reinterpret_cast(param.A_scale_inv); fp8e8m0 *B_scale_inverse = reinterpret_cast(param.B_scale_inv); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, @@ -521,7 +522,8 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, } else if (use_fp4) { // NVFP4 GEMM #if CUBLAS_VERSION >= 120800 NVTE_CHECK(cuda::cublas_version() >= 120800, - "FP4 requires cuBLAS 12.8+, but run-time cuBLAS version is ", cuda::cublas_version()); + "FP4 requires cuBLAS 12.8+, but run-time cuBLAS version is ", + cuda::cublas_version()); // make sure alpha beta computation dtype remains fp32 by CUBLASLT_MATMUL_DESC_SCALE_TYPE cublasDataType_t scale_type = CUDA_R_32F; NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( diff --git a/transformer_engine/common/util/cuda_runtime.cpp b/transformer_engine/common/util/cuda_runtime.cpp index 47cfa6bc96..0e8ff58b7c 100644 --- a/transformer_engine/common/util/cuda_runtime.cpp +++ b/transformer_engine/common/util/cuda_runtime.cpp @@ -7,6 +7,7 @@ #include "../util/cuda_runtime.h" #include + #include #include From a14d5bc25a50ff8e6f1b68448ef35bed521049cc Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Tue, 13 Jan 2026 19:11:45 +0100 Subject: [PATCH 52/98] refactored hopper tensor selection Signed-off-by: Pawel Gadzinski --- tests/cpp/operator/test_grouped_gemm.cu | 4 +- .../common/gemm/cublaslt_grouped_gemm.cu | 185 +++++++++++------- 2 files changed, 115 insertions(+), 74 deletions(-) diff --git a/tests/cpp/operator/test_grouped_gemm.cu b/tests/cpp/operator/test_grouped_gemm.cu index 90d89c77c8..35c4375cbe 100644 --- a/tests/cpp/operator/test_grouped_gemm.cu +++ b/tests/cpp/operator/test_grouped_gemm.cu @@ -44,8 +44,8 @@ enum class ShapeCase { size_t grouped_setup_workspace_size(const size_t num_tensors) { const size_t ptr_bytes = num_tensors * sizeof(void*); const size_t int_bytes = num_tensors * sizeof(int); - // Layout: 6 pointer arrays (A, B, C, D, alpha, beta) + 3 int arrays (M, N, K) - size_t size = 6 * ptr_bytes + 3 * int_bytes; + // Layout: 6 pointer arrays (A, B, C, D, alpha, beta) + 6 int arrays (a_rows, a_cols, b_rows, b_cols, d_rows, d_cols) + size_t size = 6 * ptr_bytes + 6 * int_bytes; const size_t alignment = 256; size = ((size + alignment - 1) / alignment) * alignment; return size; diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index d4696c9127..c1a75f0523 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -107,11 +107,15 @@ struct GroupedGemmSetupWorkspace { void **B_ptrs; void **C_ptrs; void **D_ptrs; - int *M; - int *N; - int *K; float **alpha_ptrs; float **beta_ptrs; + // Storage dimensions for cuBLAS matrix layouts + int *a_rows; + int *a_cols; + int *b_rows; + int *b_cols; + int *d_rows; // M (first dim) - also used for C + int *d_cols; // N (last dim) - also used for C // Initialize from workspace buffer // Layout: all pointer arrays first (8-byte aligned), then int arrays (4-byte aligned) @@ -135,22 +139,28 @@ struct GroupedGemmSetupWorkspace { ws.beta_ptrs = reinterpret_cast(setup_ws_ptr + offset); offset += ptr_size; - // Int arrays last (4-byte aligned, always satisfied after pointer arrays) - ws.M = reinterpret_cast(setup_ws_ptr + offset); + // Int arrays for storage dimensions (4-byte aligned) + ws.a_rows = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; - ws.N = reinterpret_cast(setup_ws_ptr + offset); + ws.a_cols = reinterpret_cast(setup_ws_ptr + offset); offset += int_size; - ws.K = reinterpret_cast(setup_ws_ptr + offset); + ws.b_rows = reinterpret_cast(setup_ws_ptr + offset); + offset += int_size; + ws.b_cols = reinterpret_cast(setup_ws_ptr + offset); + offset += int_size; + ws.d_rows = reinterpret_cast(setup_ws_ptr + offset); + offset += int_size; + ws.d_cols = reinterpret_cast(setup_ws_ptr + offset); return ws; } - // Calculate required size for setup workspace (pointer arrays + M/N/K) + // Calculate required size for setup workspace static size_t required_setup_size(size_t num_tensors, size_t alignment) { const size_t ptr_size = num_tensors * sizeof(void *); const size_t int_size = num_tensors * sizeof(int); - // Layout: 6 ptr arrays, then 3 int arrays (no padding needed) - size_t size = 6 * ptr_size + 3 * int_size; + // Layout: 6 ptr arrays, then 6 int arrays + size_t size = 6 * ptr_size + 6 * int_size; size = ((size + alignment - 1) / alignment) * alignment; return size; } @@ -212,14 +222,44 @@ inline void validate_grouped_gemm_inputs(const transformer_engine::GroupedTensor // Select row-wise vs column-wise storage and adjust transpose flag for grouped GEMM. // Mirrors the non-grouped GEMM logic for FP8 layout handling (TN-only on Hopper) and // fallback to column-wise data when row-wise is absent. +// Contains all information needed for GEMM setup - shape already accounts for storage layout. struct GroupedOperandSelection { - const transformer_engine::GroupedTensor *tensor = nullptr; + TensorShapeInfo shape; // Shape info with dims already swapped for columnwise if needed char *dptr = nullptr; + void *scale_inv = nullptr; transformer_engine::DType dtype = transformer_engine::DType::kNumTypes; bool trans = false; - bool use_columnwise = false; }; +// Helper to create TensorShapeInfo from a GroupedTensor, optionally swapping first/last dims. +// When swap_dims=true, first_dims and last_dims are swapped to account for columnwise storage. +// Note: tensor_offsets are the same for rowwise and columnwise data (same element count per tensor). +inline TensorShapeInfo create_shape_info(const transformer_engine::GroupedTensor *t, + bool swap_dims) { + const bool has_first = t->first_dims.has_data(); + const bool has_last = t->last_dims.has_data(); + NVTE_CHECK(has_first || t->all_same_first_dim(), + "GroupedTensor is missing first_dims for varying shapes"); + NVTE_CHECK(has_last || t->all_same_last_dim(), + "GroupedTensor is missing last_dims for varying shapes"); + + const int64_t *first_ptr = + has_first ? static_cast(t->first_dims.dptr) : nullptr; + const int64_t *last_ptr = has_last ? static_cast(t->last_dims.dptr) : nullptr; + const int64_t uniform_first = has_first ? 0 : static_cast(t->get_common_first_dim()); + const int64_t uniform_last = has_last ? 0 : static_cast(t->get_common_last_dim()); + + const int64_t *offsets_ptr = t->tensor_offsets.has_data() + ? static_cast(t->tensor_offsets.dptr) + : nullptr; + + if (swap_dims) { + // Swap first/last to account for columnwise (transposed) storage + return {last_ptr, first_ptr, offsets_ptr, uniform_last, uniform_first}; + } + return {first_ptr, last_ptr, offsets_ptr, uniform_first, uniform_last}; +} + inline GroupedOperandSelection select_grouped_operand(const transformer_engine::GroupedTensor *t, bool trans, bool is_A) { using namespace transformer_engine; @@ -236,31 +276,42 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: const DType row_dtype = t->data.dtype; const DType col_dtype = t->columnwise_data.dtype; GroupedOperandSelection sel; - sel.tensor = t; sel.trans = trans; const DType rep_dtype = has_row ? row_dtype : col_dtype; const bool is_fp8 = is_fp8_dtype(rep_dtype); const bool non_tn_fp8_ok = nvte_is_non_tn_fp8_gemm_supported(); + // Helper to select columnwise storage (swaps dims in shape) + auto use_columnwise = [&]() { + sel.dptr = static_cast(t->columnwise_data.dptr); + sel.scale_inv = t->columnwise_scale_inv.dptr; + sel.dtype = col_dtype; + sel.shape = create_shape_info(t, /*swap_dims=*/true); + }; + + // Helper to select row-wise storage + auto use_rowwise = [&]() { + sel.dptr = static_cast(t->data.dptr); + sel.scale_inv = t->scale_inv.dptr; + sel.dtype = row_dtype; + sel.shape = create_shape_info(t, /*swap_dims=*/false); + }; + // Hopper-style TN-only FP8: force TN by switching layout and flipping transpose when needed. if (is_fp8 && !non_tn_fp8_ok) { if (is_A) { if (!sel.trans) { NVTE_CHECK(has_col, "Grouped GEMM: A is missing column-wise data needed for FP8 TN layout"); - sel.dptr = static_cast(t->columnwise_data.dptr); - sel.dtype = col_dtype; + use_columnwise(); sel.trans = true; // using pre-transposed storage - sel.use_columnwise = true; return sel; } } else { // B if (sel.trans) { NVTE_CHECK(has_col, "Grouped GEMM: B is missing column-wise data needed for FP8 TN layout"); - sel.dptr = static_cast(t->columnwise_data.dptr); - sel.dtype = col_dtype; + use_columnwise(); sel.trans = false; // using pre-transposed storage - sel.use_columnwise = true; return sel; } } @@ -272,17 +323,13 @@ inline GroupedOperandSelection select_grouped_operand(const transformer_engine:: NVTE_CHECK( !is_fp8 || non_tn_fp8_ok, "Grouped GEMM: FP8 on Hopper requires row-wise data for this transpose configuration"); - sel.dptr = static_cast(t->columnwise_data.dptr); - sel.dtype = col_dtype; - sel.trans = !sel.trans; - sel.use_columnwise = true; + use_columnwise(); + sel.trans = !trans; // flip transpose for pre-transposed storage return sel; } - // Default: use row-wise data (column-wise case already handled above) - sel.dptr = static_cast(t->data.dptr); - sel.dtype = row_dtype; - sel.use_columnwise = false; + // Default: use row-wise data + use_rowwise(); return sel; } @@ -307,23 +354,15 @@ inline void init_matrix_layouts(cublasLtMatrixLayoutOpaque_t &descA, const cudaDataType_t B_type = get_cuda_dtype(B_sel.dtype); const cudaDataType_t D_type = get_cuda_dtype(D->dtype()); - // For column-major layout: leading dimension is the number of rows in storage. - // If columnwise data was chosen, storage is already transposed. - // Storage dimensions for A: rows_A x cols_A with leading dimension lda_storage - int *rows_A = A_sel.use_columnwise ? ws.M : (A_sel.trans ? ws.K : ws.M); - int *cols_A = A_sel.use_columnwise ? ws.K : (A_sel.trans ? ws.M : ws.K); - int *lda_storage = rows_A; - // Storage dimensions for B: rows_B x cols_B with leading dimension ldb_storage - int *rows_B = B_sel.use_columnwise ? ws.N : (B_sel.trans ? ws.N : ws.K); - int *cols_B = B_sel.use_columnwise ? ws.K : (B_sel.trans ? ws.K : ws.N); - int *ldb_storage = rows_B; - - NVTE_CHECK_CUBLAS( - cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, rows_A, cols_A, lda_storage)); - NVTE_CHECK_CUBLAS( - cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, rows_B, cols_B, ldb_storage)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, ws.M, ws.N, ws.M)); - NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, ws.M, ws.N, ws.M)); + // Storage dimensions computed by kernel, leading dimension = rows + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descA, A_type, num_tensors, ws.a_rows, + ws.a_cols, ws.a_rows)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descB, B_type, num_tensors, ws.b_rows, + ws.b_cols, ws.b_rows)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descC, D_type, num_tensors, ws.d_rows, + ws.d_cols, ws.d_rows)); + NVTE_CHECK_CUBLAS(cublasLtGroupedMatrixLayoutInit(&descD, D_type, num_tensors, ws.d_rows, + ws.d_cols, ws.d_rows)); } inline void init_matmul_desc(cublasLtMatmulDescOpaque_t &matmulDesc, cublasOperation_t op_A, @@ -356,15 +395,13 @@ inline void set_fp8_scale_pointers(cublasLtMatmulDescOpaque_t &matmulDesc, if (!is_fp8_a && !is_fp8_b) return; if (is_fp8_a) { - void *a_scale_inv = A_sel.use_columnwise ? A_sel.tensor->columnwise_scale_inv.dptr - : A_sel.tensor->scale_inv.dptr; + void *a_scale_inv = A_sel.scale_inv; NVTE_CHECK(a_scale_inv != nullptr, "FP8 grouped GEMM: A scale_inv is required"); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( &matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_inv, sizeof(a_scale_inv))); } if (is_fp8_b) { - void *b_scale_inv = B_sel.use_columnwise ? B_sel.tensor->columnwise_scale_inv.dptr - : B_sel.tensor->scale_inv.dptr; + void *b_scale_inv = B_sel.scale_inv; NVTE_CHECK(b_scale_inv != nullptr, "FP8 grouped GEMM: B scale_inv is required"); NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( &matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_inv, sizeof(b_scale_inv))); @@ -406,24 +443,19 @@ inline cublasLtMatmulAlgo_t select_grouped_gemm_algo(cublasLtHandle_t handle, } // Single kernel that sets up all GEMM parameters. -// Rationale: cuBLASLt grouped matmul API needs flat arrays of pointers and per-matrix M/N/K, +// Rationale: cuBLASLt grouped matmul API needs flat arrays of pointers and per-matrix dimensions, // but NVTEGroupedTensor stores a single contiguous buffer + optional per-tensor offsets/shapes. -// We bridge the mismatch on GPU by computing per-group pointers and dims in one kernel. +// We bridge the mismatch on GPU by computing per-group pointers and storage dims in one kernel. __global__ void setup_grouped_gemm_kernel( // Output arrays - void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, int *M, int *N, int *K, + void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, + int *a_rows, int *a_cols, int *b_rows, int *b_cols, int *d_rows, int *d_cols, float **alpha_ptrs, float **beta_ptrs, - // Base pointers + // Inputs char *a_base, char *b_base, char *c_base, char *d_base, - // Dimension info (per tensor) TensorShapeInfo A_meta, TensorShapeInfo B_meta, TensorShapeInfo C_meta, TensorShapeInfo D_meta, - // Element sizes size_t a_elem_size, size_t b_elem_size, size_t c_elem_size, size_t d_elem_size, - // Alpha/beta pointers (per-matrix arrays) float *alpha_ptr, float *beta_ptr, - // Transpose flags - bool transa, bool transb, - // Number of tensors size_t num_tensors) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_tensors) return; @@ -433,6 +465,8 @@ __global__ void setup_grouped_gemm_kernel( int64_t a_last = A_meta.last_dims ? A_meta.last_dims[idx] : A_meta.uniform_last; int64_t b_first = B_meta.first_dims ? B_meta.first_dims[idx] : B_meta.uniform_first; int64_t b_last = B_meta.last_dims ? B_meta.last_dims[idx] : B_meta.uniform_last; + int64_t d_first = D_meta.first_dims ? D_meta.first_dims[idx] : D_meta.uniform_first; + int64_t d_last = D_meta.last_dims ? D_meta.last_dims[idx] : D_meta.uniform_last; // Compute offsets (from array or compute from uniform dims) int64_t a_offset = @@ -450,12 +484,16 @@ __global__ void setup_grouped_gemm_kernel( C_ptrs[idx] = c_base + c_offset * c_elem_size; D_ptrs[idx] = d_base + d_offset * d_elem_size; - // Compute M, N, K dimensions from tensor shapes - // Input A is stored as {K,M} when !transa, {M,K} when transa - // Input B is stored as {N,K} when !transb, {K,N} when transb - M[idx] = static_cast(transa ? a_first : a_last); - K[idx] = static_cast(transa ? a_last : a_first); - N[idx] = static_cast(transb ? b_last : b_first); + // Compute storage dimensions for cuBLAS matrix layouts. + // For INPUTS (A, B): Row-wise storage is seen as transposed column-major by cuBLAS, + // so rows=last, cols=first. For columnwise, dims are already swapped. + a_rows[idx] = static_cast(a_last); + a_cols[idx] = static_cast(a_first); + b_rows[idx] = static_cast(b_last); + b_cols[idx] = static_cast(b_first); + // For OUTPUTS (D, C): cuBLAS writes in column-major, so rows=first (M), cols=last (N). + d_rows[idx] = static_cast(d_first); + d_cols[idx] = static_cast(d_last); // Fill alpha/beta pointers (per-matrix) alpha_ptrs[idx] = alpha_ptr + idx; @@ -468,8 +506,9 @@ inline void launch_grouped_gemm_setup( const GroupedOperandSelection &B_sel, const transformer_engine::GroupedTensor *C, const transformer_engine::GroupedTensor *D, const transformer_engine::Tensor *alpha_tensor, const transformer_engine::Tensor *beta_tensor, size_t num_tensors, cudaStream_t stream) { - TensorShapeInfo A_meta = TensorShapeInfo::from_tensor(A_sel.tensor); - TensorShapeInfo B_meta = TensorShapeInfo::from_tensor(B_sel.tensor); + // Use shape info from selection (already accounts for columnwise dimension swap) + TensorShapeInfo A_meta = A_sel.shape; + TensorShapeInfo B_meta = B_sel.shape; TensorShapeInfo C_meta = TensorShapeInfo::create_shape_info_for_C(C, D); TensorShapeInfo D_meta = TensorShapeInfo::from_tensor(D); @@ -485,10 +524,11 @@ inline void launch_grouped_gemm_setup( const int num_blocks = (num_tensors + threads_per_block - 1) / threads_per_block; setup_grouped_gemm_kernel<<>>( - ws.A_ptrs, ws.B_ptrs, ws.C_ptrs, ws.D_ptrs, ws.M, ws.N, ws.K, ws.alpha_ptrs, ws.beta_ptrs, - A_sel.dptr, B_sel.dptr, c_base, d_base, A_meta, B_meta, C_meta, D_meta, a_elem_size, - b_elem_size, c_elem_size, d_elem_size, static_cast(alpha_tensor->data.dptr), - static_cast(beta_tensor->data.dptr), A_sel.trans, B_sel.trans, num_tensors); + ws.A_ptrs, ws.B_ptrs, ws.C_ptrs, ws.D_ptrs, ws.a_rows, ws.a_cols, ws.b_rows, ws.b_cols, + ws.d_rows, ws.d_cols, ws.alpha_ptrs, ws.beta_ptrs, A_sel.dptr, B_sel.dptr, c_base, d_base, + A_meta, B_meta, C_meta, D_meta, a_elem_size, b_elem_size, c_elem_size, d_elem_size, + static_cast(alpha_tensor->data.dptr), + static_cast(beta_tensor->data.dptr), num_tensors); NVTE_CHECK_CUDA(cudaGetLastError()); } @@ -577,10 +617,11 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT // Compute average dimensions for heuristics // K dimension: if transa, K is A's first dim; if not, K is A's last dim + // Use original inputA and transa for heuristics (not modified A_sel.trans) int64_t avg_m_val = config_.avg_m.value_or(compute_avg_first_dim(outputD)); int64_t avg_n_val = config_.avg_n.value_or(compute_avg_last_dim(outputD)); - int64_t avg_k_val = config_.avg_k.value_or(A_sel.trans ? compute_avg_first_dim(A_sel.tensor) - : compute_avg_last_dim(A_sel.tensor)); + int64_t avg_k_val = config_.avg_k.value_or(transa ? compute_avg_first_dim(inputA) + : compute_avg_last_dim(inputA)); // Heuristic selection cublasLtMatmulAlgo_t algo = select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, From c5c2fbf59388234dd0f402d36eff708ec3fbb684 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 13 Jan 2026 18:14:39 +0000 Subject: [PATCH 53/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/gemm/cublaslt_grouped_gemm.cu | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index c1a75f0523..a1206474ea 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -243,15 +243,13 @@ inline TensorShapeInfo create_shape_info(const transformer_engine::GroupedTensor NVTE_CHECK(has_last || t->all_same_last_dim(), "GroupedTensor is missing last_dims for varying shapes"); - const int64_t *first_ptr = - has_first ? static_cast(t->first_dims.dptr) : nullptr; + const int64_t *first_ptr = has_first ? static_cast(t->first_dims.dptr) : nullptr; const int64_t *last_ptr = has_last ? static_cast(t->last_dims.dptr) : nullptr; const int64_t uniform_first = has_first ? 0 : static_cast(t->get_common_first_dim()); const int64_t uniform_last = has_last ? 0 : static_cast(t->get_common_last_dim()); - const int64_t *offsets_ptr = t->tensor_offsets.has_data() - ? static_cast(t->tensor_offsets.dptr) - : nullptr; + const int64_t *offsets_ptr = + t->tensor_offsets.has_data() ? static_cast(t->tensor_offsets.dptr) : nullptr; if (swap_dims) { // Swap first/last to account for columnwise (transposed) storage @@ -448,14 +446,12 @@ inline cublasLtMatmulAlgo_t select_grouped_gemm_algo(cublasLtHandle_t handle, // We bridge the mismatch on GPU by computing per-group pointers and storage dims in one kernel. __global__ void setup_grouped_gemm_kernel( // Output arrays - void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, - int *a_rows, int *a_cols, int *b_rows, int *b_cols, int *d_rows, int *d_cols, - float **alpha_ptrs, float **beta_ptrs, + void **A_ptrs, void **B_ptrs, void **C_ptrs, void **D_ptrs, int *a_rows, int *a_cols, + int *b_rows, int *b_cols, int *d_rows, int *d_cols, float **alpha_ptrs, float **beta_ptrs, // Inputs - char *a_base, char *b_base, char *c_base, char *d_base, - TensorShapeInfo A_meta, TensorShapeInfo B_meta, TensorShapeInfo C_meta, TensorShapeInfo D_meta, - size_t a_elem_size, size_t b_elem_size, size_t c_elem_size, size_t d_elem_size, - float *alpha_ptr, float *beta_ptr, + char *a_base, char *b_base, char *c_base, char *d_base, TensorShapeInfo A_meta, + TensorShapeInfo B_meta, TensorShapeInfo C_meta, TensorShapeInfo D_meta, size_t a_elem_size, + size_t b_elem_size, size_t c_elem_size, size_t d_elem_size, float *alpha_ptr, float *beta_ptr, size_t num_tensors) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_tensors) return; @@ -527,8 +523,8 @@ inline void launch_grouped_gemm_setup( ws.A_ptrs, ws.B_ptrs, ws.C_ptrs, ws.D_ptrs, ws.a_rows, ws.a_cols, ws.b_rows, ws.b_cols, ws.d_rows, ws.d_cols, ws.alpha_ptrs, ws.beta_ptrs, A_sel.dptr, B_sel.dptr, c_base, d_base, A_meta, B_meta, C_meta, D_meta, a_elem_size, b_elem_size, c_elem_size, d_elem_size, - static_cast(alpha_tensor->data.dptr), - static_cast(beta_tensor->data.dptr), num_tensors); + static_cast(alpha_tensor->data.dptr), static_cast(beta_tensor->data.dptr), + num_tensors); NVTE_CHECK_CUDA(cudaGetLastError()); } @@ -620,8 +616,8 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT // Use original inputA and transa for heuristics (not modified A_sel.trans) int64_t avg_m_val = config_.avg_m.value_or(compute_avg_first_dim(outputD)); int64_t avg_n_val = config_.avg_n.value_or(compute_avg_last_dim(outputD)); - int64_t avg_k_val = config_.avg_k.value_or(transa ? compute_avg_first_dim(inputA) - : compute_avg_last_dim(inputA)); + int64_t avg_k_val = + config_.avg_k.value_or(transa ? compute_avg_first_dim(inputA) : compute_avg_last_dim(inputA)); // Heuristic selection cublasLtMatmulAlgo_t algo = select_grouped_gemm_algo(handle, matmulDesc, descA, descB, descC, From ee71c96552c4065bee9826992e1cadfd9556c012 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 15 Jan 2026 10:35:06 -0800 Subject: [PATCH 54/98] multi-GPU grouped quantize working now in shard_map (with hack to use single-stream for multi tensor quantize) --- transformer_engine/common/cast/cast.cu | 22 +---------- .../jax/cpp_extensions/quantization.py | 21 ++++------ .../jax/csrc/extensions/quantization.cpp | 39 +++++++------------ transformer_engine/jax/flax/__init__.py | 3 +- transformer_engine/jax/flax/module.py | 20 ++++++++++ transformer_engine/jax/sharding.py | 6 ++- 6 files changed, 49 insertions(+), 62 deletions(-) diff --git a/transformer_engine/common/cast/cast.cu b/transformer_engine/common/cast/cast.cu index 73467d7275..dc77a35886 100644 --- a/transformer_engine/common/cast/cast.cu +++ b/transformer_engine/common/cast/cast.cu @@ -75,29 +75,9 @@ void nvte_multi_tensor_quantize(const NVTETensor *inputs, NVTETensor *outputs, constexpr bool IS_ACT = false; - const size_t num_streams = nvte_get_num_compute_streams(); - - int num_stream_used = std::min(num_streams, num_tensors); - // wait for current stream to finish - NVTE_CHECK_CUDA(cudaEventRecord(detail::get_compute_stream_event(0), stream)); - for (int s = 0; s < num_stream_used; s++) { - NVTE_CHECK_CUDA( - cudaStreamWaitEvent(detail::get_compute_stream(s), detail::get_compute_stream_event(0))); - } - for (int i = 0; i < num_tensors; i++) { dispatch::quantize_fwd_helper( - inputs[i], outputs[i], quant_configs, detail::get_compute_stream(i % num_streams)); - } - - // record events on compute streams - for (int s = 0; s < num_stream_used; s++) { - NVTE_CHECK_CUDA( - cudaEventRecord(detail::get_compute_stream_event(s), detail::get_compute_stream(s))); - } - // wait for all compute streams to finish - for (int s = 0; s < num_stream_used; s++) { - NVTE_CHECK_CUDA(cudaStreamWaitEvent(stream, detail::get_compute_stream_event(s))); + inputs[i], outputs[i], quant_configs, stream); } } diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py index b8ea3bd4f4..4a2c001f5b 100644 --- a/transformer_engine/jax/cpp_extensions/quantization.py +++ b/transformer_engine/jax/cpp_extensions/quantization.py @@ -1216,19 +1216,14 @@ def grouped_quantize( scale = scale.at[i].set(quantizer_i.scale[0]) if quantizer.scaling_mode == ScalingMode.CURRENT_TENSOR_SCALING: - # TODO fixme, measure perf with always scale/amax of 1 to just isolate quant and gemm - # HACK: assumes equal group sizes - assert group_axis == 0, f"Currently only group_axis = 0 is supported for current-tensor-scaling, but received {group_axis=}" - grouped_amax = jnp.max(jnp.abs(x.reshape((n_groups, x.shape[0]//n_groups, *x.shape[1:]))), axis=tuple(range(1, x.ndim+1))) - # import pdb; pdb.set_trace() - # if amax is not None: - # row_amax = amax - # else: - # row_amax = jnp.max(jnp.abs(x), axis=range(group_axis + 1, x.ndim)) - # segment_ids = jnp.repeat( - # jnp.arange(n_groups), group_sizes, total_repeat_length=x.shape[group_axis] - # ) - # grouped_amax = jax.ops.segment_max(row_amax, segment_ids, num_segments=n_groups) + if amax is not None: + row_amax = amax + else: + row_amax = jnp.max(jnp.abs(x), axis=range(group_axis + 1, x.ndim)) + segment_ids = jnp.repeat( + jnp.arange(n_groups), group_sizes, total_repeat_length=x.shape[group_axis] + ) + grouped_amax = jax.ops.segment_max(row_amax, segment_ids, num_segments=n_groups) for i in range(n_groups): tmp_scale = compute_scale_from_amax(grouped_amax[i], quantizer.q_dtype, margin=0.0) scale = scale.at[i].set(tmp_scale[0]) diff --git a/transformer_engine/jax/csrc/extensions/quantization.cpp b/transformer_engine/jax/csrc/extensions/quantization.cpp index ad3553313f..2b7beb8d6b 100644 --- a/transformer_engine/jax/csrc/extensions/quantization.cpp +++ b/transformer_engine/jax/csrc/extensions/quantization.cpp @@ -375,29 +375,19 @@ Error_Type GroupedQuantizeFFI(cudaStream_t stream, Buffer_Type inputs, Buffer_Ty size_t num_groups = group_sizes.dimensions()[0]; size_t dim_list_bytes = group_size_dtype_bytes * num_groups; std::vector dim_list_host(num_groups); - // HACK: assumes batched gemm with equal group sizes - for (size_t i = 0; i < num_groups; i++) { - if (input_dims[0] == num_groups) { - dim_list_host[i] = 1; - continue; - } - dim_list_host[i] = m / num_groups; - } - // auto *group_size_ptr = reinterpret_cast(group_sizes.untyped_data()); - // cudaMemcpyAsync(dim_list_host.data(), group_size_ptr, dim_list_bytes, cudaMemcpyDeviceToHost, - // stream); - // // Note: This may break cudaGraph. - // cudaStreamSynchronize(stream); - // printf("GroupedQuantizeFFI: m=%zu, n=%zu, group sizes = ", m, n); - // for (size_t i = 0; i < num_groups; i++) { - // printf("%d ", dim_list_host[i]); - // } - // printf("\n"); - - size_t sum_group_sizes = std::accumulate(dim_list_host.begin(), dim_list_host.end(), 0); - NVTE_CHECK(m == sum_group_sizes || input_dims[0] == sum_group_sizes, - "Unexpected group_sizes! Got %zu (M=%zu, input_dims[0] = %zu)", sum_group_sizes, m, - input_dims[0]); + auto *group_size_ptr = reinterpret_cast(group_sizes.untyped_data()); + cudaMemcpyAsync(dim_list_host.data(), group_size_ptr, dim_list_bytes, cudaMemcpyDeviceToHost, + stream); + // Note: This may break cudaGraph. + cudaStreamSynchronize(stream); + + // For MaxText case, I think is okay if this check fails as we are expecting to overallocate the buffers in the current use_ring_of_experts impl, which will result in the group sizes not filling the whole tensor. + // size_t sum_group_sizes = std::accumulate(dim_list_host.begin(), dim_list_host.end(), 0); + // NVTE_CHECK(m == sum_group_sizes || input_dims[0] == sum_group_sizes, + // "Unexpected group_sizes! Got ", sum_group_sizes, " (M=", m, ", input_dims[0] = ", input_dims[0], ")"); + + // TODO(jberchtold): This is a temporary fix to zero out the output buffers to prevent NaNs in output when this buffer is over-allocated and the groups do not fill the whole buffer. Though these NaNs should be ignored in the downstream GEMM, so more debugging is needed to see why they cause issues. + cudaMemsetAsync(outputs->untyped_data(), 0, outputs->size_bytes(), stream); if (is_delayed_scaling) { NVTE_CHECK(amaxs->dimensions()[0] == num_groups, "Unexpected amax size, Expected ", num_groups, @@ -505,8 +495,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedQuantizeHandler, GroupedQuantizeFFI, .Ret() // amax .Attr("scaling_mode") .Attr("q_layout") - .Attr("flatten_axis"), - FFI_CudaGraph_Traits); + .Attr("flatten_axis")); } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/flax/__init__.py b/transformer_engine/jax/flax/__init__.py index 59a0958b7b..1a19685697 100644 --- a/transformer_engine/jax/flax/__init__.py +++ b/transformer_engine/jax/flax/__init__.py @@ -4,7 +4,7 @@ """Transformer Engine bindings for JAX""" from .module import DenseGeneral, LayerNorm from .module import LayerNormDenseGeneral, LayerNormMLP -from .module import wrap_function_in_te_state_module, make_dot_general_cls, make_einsum_cls +from .module import wrap_function_in_te_state_module, make_dot_general_cls, make_einsum_cls, make_ragged_dot_cls from .transformer import extend_logical_axis_rules from .transformer import DotProductAttention, MultiHeadAttention, RelativePositionBiases from .transformer import TransformerLayer, TransformerLayerType @@ -17,6 +17,7 @@ "wrap_function_in_te_state_module", "make_dot_general_cls", "make_einsum_cls", + "make_ragged_dot_cls", "extend_logical_axis_rules", "DotProductAttention", "MultiHeadAttention", diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 3b4a5ef148..03d5581ae6 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1520,3 +1520,23 @@ def reorder_rhs_for_grouped_gemm(tensor, bdims, cdims): return jnp.einsum(s, x, kernel, _dot_general=dot_general, **kwargs) return wrap_function_in_te_state_module(te_einsum, quantization_recipe, "einsum")() + +def make_ragged_dot_cls(quantization_recipe): + import jax + def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwargs): + num_groups = group_sizes.shape[0] + quantizer_set = generate_quantizer_set(n_groups=num_groups) + + target_out_shape = jax.lax.ragged_dot(x, kernel, group_sizes=group_sizes).shape + + out = grouped_dense( + x, + kernel, + group_sizes=group_sizes, + contracting_dims=((1,), (1,)), + quantizer_set=quantizer_set + ) + + return out.reshape(target_out_shape) + + return wrap_function_in_te_state_module(te_grouped_dot_general, quantization_recipe, "ragged_dot")() diff --git a/transformer_engine/jax/sharding.py b/transformer_engine/jax/sharding.py index b4b8c42027..4171d1c7b0 100644 --- a/transformer_engine/jax/sharding.py +++ b/transformer_engine/jax/sharding.py @@ -51,7 +51,8 @@ def _get_mesh_info(resource: str, mesh: jax.sharding.Mesh): return mesh.shape[resource], resource -def _validate_mesh_resource_configuration(mesh_resource): +# TODO(jberchtold): FIXME, this validation fails in FP8CS amax reduction because the GlobalMeshResource is set but there is no active mesh in the context (afaict shard_map does not share it's mesh as a context), so this is triggering a FalsePositive assert. However, I am not sure if we can safely ignore this when the mesh is empty or all axes are manual as some users may use shard_map with some axes manual and some auto. +# def _validate_mesh_resource_configuration(mesh_resource): """Validate that the mesh resource configuration is consistent and conflict-free.""" is_tp_enabled = ( mesh_resource.tp_resource is not None and get_mesh_axis_size(mesh_resource.tp_resource) > 1 @@ -375,7 +376,8 @@ def global_mesh_resource() -> MeshResource: " context. If you are not using multiple GPUs, you can use an empty MeshResource by" " wrapping your program in 'with global_shard_guard(MeshResource()):'" ) - _validate_mesh_resource_configuration(_GLOBAL_MESH_RESOURCE) + # TODO(jberchtold): FIXME, this validation fails in FP8CS amax reduction because the GlobalMeshResource is set but there is no active mesh in the context (afaict shard_map does not share it's mesh as a context), so this is triggering a FalsePositive assert. However, I am not sure if we can safely ignore this when the mesh is empty or all axes are manual as some users may use shard_map with some axes manual and some auto. + # _validate_mesh_resource_configuration(_GLOBAL_MESH_RESOURCE) return _GLOBAL_MESH_RESOURCE From 9856862450547b2cbd688f30dc4fa8ecda111227 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 15 Jan 2026 11:11:55 -0800 Subject: [PATCH 55/98] reduce size of zero'ing memset to only uninitialized part of quantization buffer --- .../jax/csrc/extensions/quantization.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/quantization.cpp b/transformer_engine/jax/csrc/extensions/quantization.cpp index 2b7beb8d6b..3d98126290 100644 --- a/transformer_engine/jax/csrc/extensions/quantization.cpp +++ b/transformer_engine/jax/csrc/extensions/quantization.cpp @@ -382,13 +382,10 @@ Error_Type GroupedQuantizeFFI(cudaStream_t stream, Buffer_Type inputs, Buffer_Ty cudaStreamSynchronize(stream); // For MaxText case, I think is okay if this check fails as we are expecting to overallocate the buffers in the current use_ring_of_experts impl, which will result in the group sizes not filling the whole tensor. - // size_t sum_group_sizes = std::accumulate(dim_list_host.begin(), dim_list_host.end(), 0); + size_t sum_group_sizes = std::accumulate(dim_list_host.begin(), dim_list_host.end(), 0); // NVTE_CHECK(m == sum_group_sizes || input_dims[0] == sum_group_sizes, // "Unexpected group_sizes! Got ", sum_group_sizes, " (M=", m, ", input_dims[0] = ", input_dims[0], ")"); - // TODO(jberchtold): This is a temporary fix to zero out the output buffers to prevent NaNs in output when this buffer is over-allocated and the groups do not fill the whole buffer. Though these NaNs should be ignored in the downstream GEMM, so more debugging is needed to see why they cause issues. - cudaMemsetAsync(outputs->untyped_data(), 0, outputs->size_bytes(), stream); - if (is_delayed_scaling) { NVTE_CHECK(amaxs->dimensions()[0] == num_groups, "Unexpected amax size, Expected ", num_groups, ", got ", amaxs->dimensions()[0]); @@ -402,6 +399,13 @@ Error_Type GroupedQuantizeFFI(cudaStream_t stream, Buffer_Type inputs, Buffer_Ty size_t num_non_empty_groups = 0; size_t total_rowwise_sinv_size = 0; size_t total_colwise_sinv_size = 0; + + + // TODO(jberchtold): This is a temporary fix to zero out the output buffers to prevent NaNs in output when this buffer is over-allocated and the groups do not fill the whole buffer. Though these NaNs should be ignored in the downstream GEMM, so more debugging is needed to see why they cause issues. + size_t used_output_size = (sum_group_sizes*non_group_m) * n * output_dtype_bytes; + cudaMemsetAsync(outputs->untyped_data() + used_output_size, 0, outputs->size_bytes() - used_output_size, stream); + + for (size_t i = 0; i < num_groups; i++) { size_t m_i = dim_list_host[i] * non_group_m; // Skip for zero-size input + shiff the scale ptr From 23b5de303865ec8c560f9f4fee55015edddf43cf Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 15 Jan 2026 15:12:14 -0800 Subject: [PATCH 56/98] fix TE/JAX to work compile with latest nvte_grouped_gemm API changes --- transformer_engine/jax/cpp_extensions/gemm.py | 7 ------- transformer_engine/jax/csrc/extensions/gemm.cpp | 16 ++++++---------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 5c53dedb8a..38d21f26ec 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -2123,13 +2123,6 @@ def grouped_gemm( assert not has_bias or bias.shape == (group_sizes.size, N) bias = jnp.empty((), jnp.float32) if bias is None else bias - # print(f"{lhs_data.shape=}, {rhs_data.shape=}, {group_sizes.shape=}") - # print(f"{M=}, {N=}, {K_lhs=}, {K_rhs=}") - # import pdb; pdb.set_trace() - # import pdb; pdb.set_trace() - # print(f"{lhs_is_trans=}, {rhs_is_trans=}") - # import pdb; pdb.set_trace() - num_gemms = group_sizes.shape[0] alpha = jnp.ones((num_gemms,), jnp.float32) beta = jnp.zeros((num_gemms,), jnp.float32) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 0bfab2d7dc..13feef709a 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -604,21 +604,17 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type } NVTEGroupedTensor out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - // NVTE_CHECK(!rhs_is_trans && !lhs_is_trans, "TE grouped GEMM only supports non-transposed inputs but received rhs_is_trans=", rhs_is_trans, " lhs_is_trans=", lhs_is_trans); - nvte_grouped_gemm( - rhs_is_trans, lhs_is_trans, - alpha_tensor.data(), - rhs_tensor, lhs_tensor, - beta_tensor.data(), + rhs_tensor, rhs_is_trans, + lhs_tensor, lhs_is_trans, nullptr, out_tensor, + alpha_tensor.data(), + beta_tensor.data(), workspace_setup.data(), workspace_cublas.data(), - stream, - nullptr, - nullptr, - nullptr); + nullptr, // config (use defaults) + stream); return ffi_with_cuda_error_check(); } From 179aab63d57b31298b427929c83955052779e201 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 20 Jan 2026 16:21:46 -0800 Subject: [PATCH 57/98] some tests starting to work --- tests/jax/test_custom_call_compute.py | 93 +++++--- transformer_engine/jax/cpp_extensions/gemm.py | 28 ++- .../jax/cpp_extensions/quantization.py | 2 +- .../jax/csrc/extensions/gemm.cpp | 221 ++++++++++++++---- transformer_engine/jax/flax/module.py | 4 +- transformer_engine/jax/permutation.py | 4 + 6 files changed, 270 insertions(+), 82 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 082a99cd8b..5c8c5d1b48 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1761,50 +1761,68 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): GROUPED_DENSE_INPUT_SHAPES = [ # (n_groups, m, n, k), the actual m will be multiplied by 32 - (5, 32, 128, 64), # Test the case where n_groups is not a multiple of 4 - (8, 64, 32, 128), - (8, 64, 128, 256), + # (5, 32, 128, 64), # Test the case where n_groups is not a multiple of 4 + + # (4, 16, 4, 4), + + (3, 192, 64, 96), + + # (8, 64, 32, 128), + # (8, 64, 128, 256), ] @pytest_parametrize_wrapper("input_shape", GROUPED_DENSE_INPUT_SHAPES) class TestGroupedDense: def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): - lhs_contract_dim, _ = contracting_dims - assert len(lhs_contract_dim) == 1 and lhs.ndim == 2 and rhs.ndim == 3 - if bias is None: - bias = jnp.zeros((rhs.shape[0], rhs.shape[2]), dtype=lhs.dtype) - else: - assert bias.ndim == 2 and bias.shape == (rhs.shape[0], rhs.shape[2]) - remaining_axis = (set(range(lhs.ndim)) - set(lhs_contract_dim)).pop() - lhs = jnp.split(lhs, jnp.cumulative_sum(group_sizes)[:-1], axis=remaining_axis) - rhs = jnp.split(rhs, rhs.shape[0], axis=0) - bias = jnp.split(bias, bias.shape[0], axis=0) - ref_out = [] - dim_num = (contracting_dims, ((), ())) - for lhs_i, rhs_i, bias_i in zip(lhs, rhs, bias): - out_i = jax.lax.dot_general( - lhs_i, rhs_i, dim_num, precision=jax.lax.Precision.HIGHEST - ) + jnp.expand_dims(bias_i, axis=0) - ref_out.append(jnp.squeeze(out_i)) - return ref_out + out = jax.lax.ragged_dot(lhs, rhs, group_sizes) + print(f"In ref grouped dense: {lhs.shape=}, {rhs.shape=}, {out.shape=}") + return out + + dot_dimension_numbers = (((), ()), contracting_dims) + lhs_ragged_dimensions = (0,) + rhs_group_dimensions = (0,) + print(lhs.shape, rhs.shape, group_sizes, dot_dimension_numbers, lhs_ragged_dimensions, rhs_group_dimensions) + dims = jax.lax.RaggedDotDimensionNumbers(dot_dimension_numbers, lhs_ragged_dimensions, rhs_group_dimensions) + return jax.lax.ragged_dot_general(lhs, rhs, group_sizes, dims) + # lhs_contract_dim, _ = contracting_dims + # assert len(lhs_contract_dim) == 1 and lhs.ndim == 2 and rhs.ndim == 3 + # if bias is None: + # bias = jnp.zeros((rhs.shape[0], rhs.shape[2]), dtype=lhs.dtype) + # else: + # assert bias.ndim == 2 and bias.shape == (rhs.shape[0], rhs.shape[2]) + # remaining_axis = (set(range(lhs.ndim)) - set(lhs_contract_dim)).pop() + # lhs = jnp.split(lhs, jnp.cumulative_sum(group_sizes)[:-1], axis=remaining_axis) + # rhs = jnp.split(rhs, rhs.shape[0], axis=0) + # bias = jnp.split(bias, bias.shape[0], axis=0) + # ref_out = [] + # dim_num = (contracting_dims, ((), ())) + # for lhs_i, rhs_i, bias_i in zip(lhs, rhs, bias): + # out_i = jax.lax.dot_general( + # lhs_i, rhs_i, dim_num, precision=jax.lax.Precision.HIGHEST + # ) + jnp.expand_dims(bias_i, axis=0) + # ref_out.append(jnp.squeeze(out_i)) + # return ref_out def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", with_bias=False): key = jax.random.PRNGKey(0) subkeys = jax.random.split(key, 4) n_groups, m, n, k = input_shape - group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) - group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) - group_sizes = jnp.diff(group_sizes) - # Make one empty input lhs to test empty GEMM handling - group_sizes = group_sizes.at[0].set(group_sizes[0] + group_sizes[1]) - group_sizes = group_sizes.at[1].set(0) + # group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) + # group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) + # group_sizes = jnp.diff(group_sizes) + + # # Make one empty input lhs to test empty GEMM handling + # group_sizes = group_sizes.at[0].set(group_sizes[0] + group_sizes[1]) + # group_sizes = group_sizes.at[1].set(0) + + group_sizes = jnp.full((n_groups,), m // n_groups) assert group_sizes.sum() == m # *32 to make sure that input shape works for MXFP8 - group_sizes = group_sizes * 32 - m = m * 32 + # group_sizes = group_sizes * 32 + # m = m * 32 lhs_shape = (m if data_layout[0] == "N" else k, k if data_layout[0] == "N" else m) rhs_shape = (n_groups, k if data_layout[1] == "N" else n, n if data_layout[1] == "N" else k) @@ -1822,9 +1840,15 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi def _assert_grouped_gemm_output(self, out, group_sizes, ref_list, dtype): assert out.dtype == ref_list[0].dtype - out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) - for i in range(len(ref_list)): - assert_allclose(out_list[i], ref_list[i], dtype=dtype) + import numpy as np + np.set_printoptions(threshold=10000) + jnp.set_printoptions(threshold=10000) + print("Actual:", out) + print("Expected:", ref_list) + assert_allclose(out, ref_list, dtype=dtype) + # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) + # for i in range(len(ref_list)): + # assert_allclose(out_list[i], ref_list[i], dtype=dtype) @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) @pytest_parametrize_wrapper("layout", ["NN"]) @@ -1979,7 +2003,7 @@ def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): # ('ij,jk->ik', (64, 32), (32, 128)), # ('bij,bjk->bik', (8, 64, 32), (8, 32, 128)), # ('abc,cde->abde', (4, 8, 16), (16, 32, 64)), - ('BSM,BSEC->EBCM', (2, 4096, 4096), (2, 4096, 8, 1024)), + ('BSM,BSEC->EBCM', (2, 16, 16), (2, 16, 8, 8)), ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)) , ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)), ('EBCH,EHM->EBCM', (8, 2, 1024, 14336), (8, 14336, 4096)), @@ -2014,6 +2038,9 @@ def test_einsum_fwd(self, eqn, a_shape, b_shape, dtype, quantization_recipe): te_out = jax.jit(functools.partial(self._te_einsum, eqn, quantization_recipe=quantization_recipe))(a, b) ref_out = jax.jit(functools.partial(self._ref_einsum, eqn))(a, b) + # jax.config.update("jax_numpy_rank_promotion", "raise") + # jnp.set_printoptions(threshold=jnp.inf, linewidth=jnp.inf) + # print(te_out) assert_allclose(te_out, ref_out, dtype=dtype) def test_einsum_fwd_and_bwd(self, eqn, a_shape, b_shape, dtype, quantization_recipe): diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 38d21f26ec..23d774b2ca 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -1543,7 +1543,8 @@ def abstract( out_shape = (M, N) if is_grouped_dense_wgrad: - out_shape = (group_sizes_aval.size, M, N) + num_tensors = group_sizes_aval.size // 2 # packed int32 -> logical int64 shape + out_shape = (num_tensors, M, N) out_aval = jax.core.ShapedArray(shape=out_shape, dtype=out_dtype) return (out_aval, workspace_aval) @@ -1980,8 +1981,6 @@ def grouped_gemm( lhs: [M, K] or [K, N] rhs: [G, N, K] or [G, K, N] or [G * K, N] or [N, G * K] """ - # TODO(Phuong): implement the group_offset - group_offset = group_offset or jnp.zeros((1,), jnp.int32) # TODO(Phuong): implement the precision del precision @@ -2117,13 +2116,29 @@ def grouped_gemm( else: assert group_sizes.size == rhs_shape[0] - assert group_offset.size == 1 - has_bias = bias is not None assert not has_bias or bias.shape == (group_sizes.size, N) bias = jnp.empty((), jnp.float32) if bias is None else bias - num_gemms = group_sizes.shape[0] + + if group_offset is None: + # Compute group_offset as cumulative sum of group_sizes, starting with 0 + group_offset = jnp.concatenate([jnp.array([0], dtype=jnp.int32), jnp.cumsum(group_sizes, dtype=jnp.int32)[:-1]]) + group_offset *= K_lhs # Offset is by number of elements total, not number of rows + + jax.debug.print("group_sizes: {}, group_offset: {}", group_sizes, group_offset) + jax.debug.print("M={}, jnp.sum(group_sizes)={}, N={}, K_lhs={}", M, jnp.sum(group_sizes), N, K_lhs) + jax.debug.print("lhs_data.size={}, group_offset={}", lhs_data.size, group_offset) + + # print(f"{lhs_data.shape=}, {rhs_data.shape=}, {M=}, {N=}, {K_lhs=}") + + # Interlace zeros with group_sizes to upcast packed int32s to int64 + # This ensures proper alignment and prevents overflow issues + zeros = jnp.zeros_like(group_sizes, dtype=jnp.int32) + group_sizes = jnp.stack([group_sizes, zeros], axis=1).flatten() + group_offset = jnp.stack([group_offset, zeros], axis=1).flatten() + + num_gemms = group_sizes.shape[0] // 2 # Due to interlaced zeros to support int64 alpha = jnp.ones((num_gemms,), jnp.float32) beta = jnp.zeros((num_gemms,), jnp.float32) (out,) = GroupedGemmPrimitive.outer_primitive.bind( @@ -2147,4 +2162,5 @@ def grouped_gemm( is_grouped_dense_wgrad=is_grouped_dense_wgrad, use_async_d2h_group_sizes=use_async_d2h_group_sizes, ) + print(f"GroupedGemm: {lhs_data.shape=}, {rhs_data.shape=}, {out.shape=}") return out diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py index 4a2c001f5b..8add335fbf 100644 --- a/transformer_engine/jax/cpp_extensions/quantization.py +++ b/transformer_engine/jax/cpp_extensions/quantization.py @@ -96,7 +96,7 @@ def abstract( dtype = dtypes.canonicalize_dtype(x_aval.dtype) assert dtype in [jnp.float32, jnp.float16, jnp.bfloat16] out_shape = x_aval.shape - assert scale_aval is None or scale_aval.dtype == jnp.float32 + assert scale_aval is None or scale_aval.dtype == jnp.float32, f"scale must be float32 but received {scale_aval}" if stochastic_rounding: assert ScalingMode( scaling_mode diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 13feef709a..108a6b6843 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -399,37 +399,58 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmD2HGroupSizesHandler, GroupedGemmD2HGro .Ret() // dummy_output .Attr("num_gemms")); -NVTEGroupedTensor make_grouped_tensor(Buffer_Type const& data, std::optional scale_inv, JAXX_Scaling_Mode scaling_mode, size_t num_tensors, NVTEShape const& dataShape) { - // printf("make_grouped_tensor data shape: "); - // for (auto dim : data.dimensions()) { - // printf("%zu, ", dim); - // } - // printf("\n"); - // NVTEShape logical_shape{}; - // if (data.dimensions().size() == 1) { - // // HACK - // size_t cdim_size = 4096; - // logical_shape.ndim = 2; - // logical_shape.data[0] = data.dimensions()[0] / cdim_size; - // logical_shape.data[1] = cdim_size; - // printf("NUM TENSORS: %zu\n", num_tensors); - // } - // else { - // NVTE_CHECK(data.dimensions().size() == 2, "Expected 2D tensor for GEMM operand but received ndim=", data.dimensions().size()); - - // logical_shape.ndim = 2; - // logical_shape.data[0] = data.dimensions()[0]; - // logical_shape.data[1] = data.dimensions()[1]; - // } - - NVTEGroupedTensor grouped_tensor = nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, dataShape); - - NVTEBasicTensor data_tensor{reinterpret_cast(data.untyped_data()), - static_cast(convert_ffi_datatype_to_te_dtype(data.element_type())), - dataShape}; - nvte_set_grouped_tensor_param(&grouped_tensor, kNVTEGroupedRowwiseData, &data_tensor); +class JAXX_GroupedTensorWrapper { +public: + JAXX_GroupedTensorWrapper() = delete; + JAXX_GroupedTensorWrapper(JAXX_Scaling_Mode scaling_mode, size_t num_tensors, + NVTEShape const& dataShape); + ~JAXX_GroupedTensorWrapper() = default; + + void set_rowwise(Buffer_Type const& data, std::optional const& scale_inv); + void set_group_info(Buffer_Type const& group_sizes, Buffer_Type const& group_offsets); + + operator NVTEGroupedTensor() const { return m_grouped_tensor; } + NVTEGroupedTensor const& get_grouped_tensor() const; + +private: + NVTEShape m_data_shape{}; + NVTEGroupedTensor m_grouped_tensor{}; + + // Internal tensors. These need to be kept alive as long as the grouped tensor is alive. + NVTEBasicTensor m_data_tensor{}; + NVTEBasicTensor m_scale_inv_tensor{}; + + NVTEBasicTensor m_sizes_tensor{}; + NVTEBasicTensor m_offsets_tensor{}; +}; + +JAXX_GroupedTensorWrapper::JAXX_GroupedTensorWrapper(JAXX_Scaling_Mode scaling_mode, + size_t num_tensors, + NVTEShape const& dataShape) { + m_data_shape = dataShape; + m_grouped_tensor = nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, dataShape); +} + +void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const& data, + std::optional const& scale_inv) { + printf("set_rowwise data shape: XLA buffer shape: "); + for (auto dim : data.dimensions()) { + printf("%zu, ", dim); + } + printf("NVTEShape: "); + for (int i = 0; i < m_data_shape.ndim; ++i) { + printf("%d, ", m_data_shape.data[i]); + } + printf("\n"); + NVTEDType data_dtype = static_cast(convert_ffi_datatype_to_te_dtype(data.element_type())); + m_data_tensor = NVTEBasicTensor{reinterpret_cast(data.untyped_data()), data_dtype, + m_data_shape}; + + nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedRowwiseData, &m_data_tensor); if (scale_inv.has_value()) { + NVTEDType scale_inv_dtype = + static_cast(convert_ffi_datatype_to_te_dtype(scale_inv->element_type())); NVTEShape logical_scale_shape{}; if (scale_inv->dimensions().size() == 1) { logical_scale_shape.ndim = 1; @@ -439,20 +460,116 @@ NVTEGroupedTensor make_grouped_tensor(Buffer_Type const& data, std::optionaldimensions()[0]; logical_scale_shape.data[1] = scale_inv->dimensions()[1]; } else { - NVTE_CHECK(false, "Expected 1D or 2D tensor for GEMM scale_inv but received ndim=", scale_inv->dimensions().size()); + NVTE_CHECK(false, "Expected 1D or 2D tensor for GEMM scale_inv but received ndim=", + scale_inv->dimensions().size()); } - NVTEBasicTensor scale_inv_tensor{reinterpret_cast(scale_inv->untyped_data()), - static_cast(convert_ffi_datatype_to_te_dtype(scale_inv->element_type())), - logical_scale_shape}; - nvte_set_grouped_tensor_param(&grouped_tensor, kNVTEGroupedRowwiseScaleInv, &scale_inv_tensor); + m_scale_inv_tensor = NVTEBasicTensor{reinterpret_cast(scale_inv->untyped_data()), + scale_inv_dtype, logical_scale_shape}; + nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedRowwiseScaleInv, + &m_scale_inv_tensor); } +} + +void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, + Buffer_Type const& group_offsets) { + NVTEDType sizes_dtype = + static_cast(convert_ffi_datatype_to_te_dtype(group_sizes.element_type())); + NVTEDType offsets_dtype = + static_cast(convert_ffi_datatype_to_te_dtype(group_offsets.element_type())); + + NVTE_CHECK(sizes_dtype == NVTEDType::kNVTEInt32, + "group_sizes must be of type int32."); + NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt32, + "group_offsets must be of type int32."); + + // JAX only supports int32 but cuBLAS requires int64 so we pack two int32 into one int64 + size_t num_tensors = group_sizes.dimensions()[0] / 2; + NVTE_CHECK(group_sizes.dimensions().size() == 1, + "group_sizes must be a 1D tensor with length equal to the number of tensors."); + NVTE_CHECK(group_offsets.dimensions().size() == 1, + "group_offsets must be a 1D tensor with length equal to the number of tensors."); + NVTE_CHECK(group_offsets.dimensions()[0] == 2 * num_tensors, + "group_sizes and group_offsets must have the same number of elements."); + + NVTEShape shape{}; + shape.ndim = 1; + shape.data[0] = num_tensors; + + m_sizes_tensor = NVTEBasicTensor{reinterpret_cast(group_sizes.untyped_data()), + NVTEDType::kNVTEInt64, + shape}; + m_offsets_tensor = NVTEBasicTensor{reinterpret_cast(group_offsets.untyped_data()), + NVTEDType::kNVTEInt64, + shape}; + + nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedFirstDims, &m_sizes_tensor); + nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedTensorOffsets, &m_offsets_tensor); +} + +NVTEGroupedTensor const& JAXX_GroupedTensorWrapper::get_grouped_tensor() const { + return m_grouped_tensor; +} + +JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const& data, std::optional scale_inv, JAXX_Scaling_Mode scaling_mode, size_t num_tensors, NVTEShape const& dataShape) { + JAXX_GroupedTensorWrapper grouped_tensor_wrapper(scaling_mode, num_tensors, dataShape); + grouped_tensor_wrapper.set_rowwise(data, scale_inv); - return grouped_tensor; + return grouped_tensor_wrapper; } +// NVTEGroupedTensor make_grouped_tensor(Buffer_Type const& data, std::optional scale_inv, JAXX_Scaling_Mode scaling_mode, size_t num_tensors, NVTEShape const& dataShape) { +// // printf("make_grouped_tensor data shape: "); +// // for (auto dim : data.dimensions()) { +// // printf("%zu, ", dim); +// // } +// // printf("\n"); +// // NVTEShape logical_shape{}; +// // if (data.dimensions().size() == 1) { +// // // HACK +// // size_t cdim_size = 4096; +// // logical_shape.ndim = 2; +// // logical_shape.data[0] = data.dimensions()[0] / cdim_size; +// // logical_shape.data[1] = cdim_size; +// // printf("NUM TENSORS: %zu\n", num_tensors); +// // } +// // else { +// // NVTE_CHECK(data.dimensions().size() == 2, "Expected 2D tensor for GEMM operand but received ndim=", data.dimensions().size()); + +// // logical_shape.ndim = 2; +// // logical_shape.data[0] = data.dimensions()[0]; +// // logical_shape.data[1] = data.dimensions()[1]; +// // } + +// NVTEGroupedTensor grouped_tensor = nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, dataShape); + +// NVTEBasicTensor data_tensor{reinterpret_cast(data.untyped_data()), +// static_cast(convert_ffi_datatype_to_te_dtype(data.element_type())), +// dataShape}; +// nvte_set_grouped_tensor_param(&grouped_tensor, kNVTEGroupedRowwiseData, &data_tensor); + +// if (scale_inv.has_value()) { +// NVTEShape logical_scale_shape{}; +// if (scale_inv->dimensions().size() == 1) { +// logical_scale_shape.ndim = 1; +// logical_scale_shape.data[0] = scale_inv->dimensions()[0]; +// } else if (scale_inv->dimensions().size() == 2) { +// logical_scale_shape.ndim = 2; +// logical_scale_shape.data[0] = scale_inv->dimensions()[0]; +// logical_scale_shape.data[1] = scale_inv->dimensions()[1]; +// } else { +// NVTE_CHECK(false, "Expected 1D or 2D tensor for GEMM scale_inv but received ndim=", scale_inv->dimensions().size()); +// } +// NVTEBasicTensor scale_inv_tensor{reinterpret_cast(scale_inv->untyped_data()), +// static_cast(convert_ffi_datatype_to_te_dtype(scale_inv->element_type())), +// logical_scale_shape}; +// nvte_set_grouped_tensor_param(&grouped_tensor, kNVTEGroupedRowwiseScaleInv, &scale_inv_tensor); +// } + +// return grouped_tensor; +// } Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv, Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias, - Buffer_Type group_sizes, Buffer_Type group_offset, + Buffer_Type group_sizes, Buffer_Type group_offsets, Buffer_Type alpha, Buffer_Type beta, Result_Type output, Result_Type workspace, size_t m, size_t n, size_t k, bool lhs_is_trans, @@ -487,7 +604,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type auto bias_dtype = convert_ffi_datatype_to_te_dtype(bias.element_type()); NVTE_CHECK(group_sizes.dimensions().size() == 1); - size_t num_gemms = group_sizes.dimensions()[0]; + size_t num_gemms = group_sizes.dimensions()[0] / 2; // JAX only supports int32 but cuBLAS requires int64 so we pack two int32 into one int64 // It is weird that TE/Common GEMM only use colwise for MXFP8 const bool is_fp8_gemm = is_fp8_dtype(lhs_dtype); @@ -584,6 +701,10 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type TensorWrapper beta_tensor(static_cast(beta.untyped_data()), std::vector{num_gemms}, convert_ffi_datatype_to_te_dtype(beta.element_type())); + + printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu\n", num_gemms, m, n, k, group_sizes.dimensions()[0] / 2); + + //// RHS NVTEShape rhsShape{.data={k, n}, .ndim=2}; if (rhs_is_trans) { std::swap(rhsShape.data[0], rhsShape.data[1]); @@ -592,17 +713,37 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // If is_grouped_dense_wgrad, then n already includes num_gemms (G) pre-multiplied in gemm.py, so we don't need to multiply it here. rhsShape.data[0] *= num_gemms; } - NVTEGroupedTensor rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); + auto rhs_tensor = make_grouped_tensor(rhs_data, std::nullopt, JAXX_Scaling_Mode::NO_SCALING,/*rhs_sinv, scaling_mode,*/ num_gemms, rhsShape); + + //// LHS NVTEShape lhsShape{.data={m, k}, .ndim=2}; if (lhs_is_trans && is_grouped_dense_wgrad) { std::swap(lhsShape.data[0], lhsShape.data[1]); } - NVTEGroupedTensor lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); + auto lhs_tensor = make_grouped_tensor(lhs_data, std::nullopt, JAXX_Scaling_Mode::NO_SCALING,/*lhs_sinv, scaling_mode,*/ num_gemms, lhsShape); + if (!is_grouped_dense_wgrad) { + lhs_tensor.set_group_info(group_sizes, group_offsets); + } + + //// OUTPUT NVTEShape outShape{.data={m, n}, .ndim=2}; if (is_grouped_dense_wgrad) { outShape.data[0] *= num_gemms; } - NVTEGroupedTensor out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); + auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); + if (is_grouped_dense_wgrad) { + out_tensor.set_group_info(group_sizes, group_offsets); + } + + printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", + rhsShape.data[0], rhsShape.data[1], + lhsShape.data[0], lhsShape.data[1], + outShape.data[0], outShape.data[1]); + + printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); + + // HACK: jberchtold FIXME + // cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); nvte_grouped_gemm( rhs_tensor, rhs_is_trans, diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 03d5581ae6..f9757d29b4 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1494,7 +1494,7 @@ def reorder_rhs_for_grouped_gemm(tensor, bdims, cdims): kernel = reorder_rhs_for_grouped_gemm(kernel, (batch_dims[1],), contracting_dims[1]) num_groups = kernel.shape[0] - group_size = math.prod(x.shape[:-1]) // num_groups + group_size = x.shape[1] print(f'{num_groups=}, {group_size=}, {x.shape=}, {kernel.shape=}') group_sizes = jnp.array([group_size]*num_groups, dtype=jnp.int32) @@ -1534,7 +1534,7 @@ def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwa kernel, group_sizes=group_sizes, contracting_dims=((1,), (1,)), - quantizer_set=quantizer_set + # quantizer_set=quantizer_set ) return out.reshape(target_out_shape) diff --git a/transformer_engine/jax/permutation.py b/transformer_engine/jax/permutation.py index 55a59a1650..636740922e 100644 --- a/transformer_engine/jax/permutation.py +++ b/transformer_engine/jax/permutation.py @@ -73,6 +73,10 @@ def token_dispatch( Permuted probabilities of shape [num_out_tokens], or None if probs was not provided. row_id_map : jnp.ndarray Row ID map for use in token_combine (shape [num_tokens, num_experts * 2 + 1]). + + [num_tokens, 0:num_experts] = expert indices for each token + + [num_experts] = max([num_tokens, 0:num_experts], axis=0) + 1 """ return _token_dispatch(inp, routing_map, probs, num_out_tokens) From 6a54ff8f7a602497a98dadd0ee15d4442f4f52ba Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 20 Jan 2026 16:51:43 -0800 Subject: [PATCH 58/98] wip --- tests/jax/test_custom_call_compute.py | 73 ++++++++----------- .../jax/csrc/extensions/gemm.cpp | 7 +- 2 files changed, 37 insertions(+), 43 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 5c8c5d1b48..a45b7fd4af 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1771,38 +1771,33 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): # (8, 64, 128, 256), ] +# TODO(jberchtold): Support MXFP8 and NVFP4 +grouped_gemm_supported_scaling_modes = [ + # ScalingMode.DELAYED_TENSOR_SCALING, + ScalingMode.CURRENT_TENSOR_SCALING +] @pytest_parametrize_wrapper("input_shape", GROUPED_DENSE_INPUT_SHAPES) class TestGroupedDense: def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): - out = jax.lax.ragged_dot(lhs, rhs, group_sizes) - print(f"In ref grouped dense: {lhs.shape=}, {rhs.shape=}, {out.shape=}") - return out - - dot_dimension_numbers = (((), ()), contracting_dims) - lhs_ragged_dimensions = (0,) - rhs_group_dimensions = (0,) - print(lhs.shape, rhs.shape, group_sizes, dot_dimension_numbers, lhs_ragged_dimensions, rhs_group_dimensions) - dims = jax.lax.RaggedDotDimensionNumbers(dot_dimension_numbers, lhs_ragged_dimensions, rhs_group_dimensions) - return jax.lax.ragged_dot_general(lhs, rhs, group_sizes, dims) - # lhs_contract_dim, _ = contracting_dims - # assert len(lhs_contract_dim) == 1 and lhs.ndim == 2 and rhs.ndim == 3 - # if bias is None: - # bias = jnp.zeros((rhs.shape[0], rhs.shape[2]), dtype=lhs.dtype) - # else: - # assert bias.ndim == 2 and bias.shape == (rhs.shape[0], rhs.shape[2]) - # remaining_axis = (set(range(lhs.ndim)) - set(lhs_contract_dim)).pop() - # lhs = jnp.split(lhs, jnp.cumulative_sum(group_sizes)[:-1], axis=remaining_axis) - # rhs = jnp.split(rhs, rhs.shape[0], axis=0) - # bias = jnp.split(bias, bias.shape[0], axis=0) - # ref_out = [] - # dim_num = (contracting_dims, ((), ())) - # for lhs_i, rhs_i, bias_i in zip(lhs, rhs, bias): - # out_i = jax.lax.dot_general( - # lhs_i, rhs_i, dim_num, precision=jax.lax.Precision.HIGHEST - # ) + jnp.expand_dims(bias_i, axis=0) - # ref_out.append(jnp.squeeze(out_i)) - # return ref_out + lhs_contract_dim, _ = contracting_dims + assert len(lhs_contract_dim) == 1 and lhs.ndim == 2 and rhs.ndim == 3 + if bias is None: + bias = jnp.zeros((rhs.shape[0], rhs.shape[2]), dtype=lhs.dtype) + else: + assert bias.ndim == 2 and bias.shape == (rhs.shape[0], rhs.shape[2]) + remaining_axis = (set(range(lhs.ndim)) - set(lhs_contract_dim)).pop() + lhs = jnp.split(lhs, jnp.cumulative_sum(group_sizes)[:-1], axis=remaining_axis) + rhs = jnp.split(rhs, rhs.shape[0], axis=0) + bias = jnp.split(bias, bias.shape[0], axis=0) + ref_out = [] + dim_num = (contracting_dims, ((), ())) + for lhs_i, rhs_i, bias_i in zip(lhs, rhs, bias): + out_i = jax.lax.dot_general( + lhs_i, rhs_i, dim_num, precision=jax.lax.Precision.HIGHEST + ) + jnp.expand_dims(bias_i, axis=0) + ref_out.append(jnp.squeeze(out_i)) + return ref_out def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", with_bias=False): key = jax.random.PRNGKey(0) @@ -1840,15 +1835,11 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi def _assert_grouped_gemm_output(self, out, group_sizes, ref_list, dtype): assert out.dtype == ref_list[0].dtype - import numpy as np - np.set_printoptions(threshold=10000) - jnp.set_printoptions(threshold=10000) - print("Actual:", out) - print("Expected:", ref_list) - assert_allclose(out, ref_list, dtype=dtype) - # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) - # for i in range(len(ref_list)): - # assert_allclose(out_list[i], ref_list[i], dtype=dtype) + out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) + print([o.shape for o in out_list]) + print([r.shape for r in ref_list]) + for i in range(len(ref_list)): + assert_allclose(out_list[i], ref_list[i], dtype=dtype) @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) @pytest_parametrize_wrapper("layout", ["NN"]) @@ -1878,7 +1869,7 @@ def test_grouped_gemm_fp16(self, dtype, input_shape, layout): @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason) @pytest.mark.parametrize("fwd_bwd_dtype", fwd_bwd_dtypes) - @pytest_parametrize_wrapper("scaling_mode", non_fp4_supported_scaling_modes) + @pytest_parametrize_wrapper("scaling_mode", grouped_gemm_supported_scaling_modes) @pytest_parametrize_wrapper("layout", ["NN"]) def test_grouped_gemm_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape, layout): fwd_dtype, bwd_dtype = fwd_bwd_dtype @@ -1933,7 +1924,7 @@ def test_grouped_dense_grad_fp16(self, dtype, input_shape): x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( dtype, input_shape, - with_bias=True, + with_bias=False, ) value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) @@ -1959,14 +1950,14 @@ def test_grouped_dense_grad_fp16(self, dtype, input_shape): "fwd_bwd_dtype", [(jnp.float8_e4m3fn, jnp.float8_e4m3fn), (jnp.float8_e4m3fn, jnp.float8_e5m2)], ) - @pytest_parametrize_wrapper("scaling_mode", non_fp4_supported_scaling_modes) + @pytest_parametrize_wrapper("scaling_mode", grouped_gemm_supported_scaling_modes) def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): fwd_dtype, bwd_dtype = fwd_bwd_dtype dtype = jnp.bfloat16 x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( dtype, input_shape, - with_bias=True, + with_bias=False, ) quantizer_set = QuantizerFactory.create_set( diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 108a6b6843..2ab578b8d0 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -512,6 +512,9 @@ NVTEGroupedTensor const& JAXX_GroupedTensorWrapper::get_grouped_tensor() const { JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const& data, std::optional scale_inv, JAXX_Scaling_Mode scaling_mode, size_t num_tensors, NVTEShape const& dataShape) { JAXX_GroupedTensorWrapper grouped_tensor_wrapper(scaling_mode, num_tensors, dataShape); + if (scaling_mode == JAXX_Scaling_Mode::NO_SCALING) { + scale_inv = std::nullopt; + } grouped_tensor_wrapper.set_rowwise(data, scale_inv); return grouped_tensor_wrapper; @@ -713,14 +716,14 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // If is_grouped_dense_wgrad, then n already includes num_gemms (G) pre-multiplied in gemm.py, so we don't need to multiply it here. rhsShape.data[0] *= num_gemms; } - auto rhs_tensor = make_grouped_tensor(rhs_data, std::nullopt, JAXX_Scaling_Mode::NO_SCALING,/*rhs_sinv, scaling_mode,*/ num_gemms, rhsShape); + auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); //// LHS NVTEShape lhsShape{.data={m, k}, .ndim=2}; if (lhs_is_trans && is_grouped_dense_wgrad) { std::swap(lhsShape.data[0], lhsShape.data[1]); } - auto lhs_tensor = make_grouped_tensor(lhs_data, std::nullopt, JAXX_Scaling_Mode::NO_SCALING,/*lhs_sinv, scaling_mode,*/ num_gemms, lhsShape); + auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); if (!is_grouped_dense_wgrad) { lhs_tensor.set_group_info(group_sizes, group_offsets); } From 8c86a86003cc1aaeb7cd9e95309a91879dd86685 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Wed, 21 Jan 2026 11:44:57 -0800 Subject: [PATCH 59/98] wip --- tests/jax/test_custom_call_compute.py | 80 ++++++++++++++----- .../jax/csrc/extensions/gemm.cpp | 69 +++------------- 2 files changed, 73 insertions(+), 76 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index a45b7fd4af..bcee1d4860 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1765,8 +1765,9 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): # (4, 16, 4, 4), - (3, 192, 64, 96), + # (3, 192, 64, 96), + (8, 64*8, 128*8, 128*8), # (8, 64, 32, 128), # (8, 64, 128, 256), ] @@ -1780,6 +1781,7 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): @pytest_parametrize_wrapper("input_shape", GROUPED_DENSE_INPUT_SHAPES) class TestGroupedDense: def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): + # return jax.lax.ragged_dot(lhs, rhs, group_sizes) lhs_contract_dim, _ = contracting_dims assert len(lhs_contract_dim) == 1 and lhs.ndim == 2 and rhs.ndim == 3 if bias is None: @@ -1797,34 +1799,35 @@ def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): lhs_i, rhs_i, dim_num, precision=jax.lax.Precision.HIGHEST ) + jnp.expand_dims(bias_i, axis=0) ref_out.append(jnp.squeeze(out_i)) - return ref_out + return jnp.concatenate(ref_out, axis=0) def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", with_bias=False): key = jax.random.PRNGKey(0) subkeys = jax.random.split(key, 4) n_groups, m, n, k = input_shape - # group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) - # group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) - # group_sizes = jnp.diff(group_sizes) + group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) + group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) + group_sizes = jnp.diff(group_sizes) - # # Make one empty input lhs to test empty GEMM handling - # group_sizes = group_sizes.at[0].set(group_sizes[0] + group_sizes[1]) - # group_sizes = group_sizes.at[1].set(0) - - group_sizes = jnp.full((n_groups,), m // n_groups) - assert group_sizes.sum() == m + # Make one empty input lhs to test empty GEMM handling + group_sizes = group_sizes.at[0].set(group_sizes[0] + group_sizes[1]) + group_sizes = group_sizes.at[1].set(0) # *32 to make sure that input shape works for MXFP8 # group_sizes = group_sizes * 32 # m = m * 32 + group_sizes = jnp.full((n_groups,), m // n_groups) + assert group_sizes.sum() == m + lhs_shape = (m if data_layout[0] == "N" else k, k if data_layout[0] == "N" else m) rhs_shape = (n_groups, k if data_layout[1] == "N" else n, n if data_layout[1] == "N" else k) bias_shape = (n_groups, n) - lhs = jax.random.uniform(subkeys[1], lhs_shape, dtype=dtype) - rhs = jax.random.uniform(subkeys[2], rhs_shape, dtype=dtype) + lhs = jax.random.uniform(subkeys[1], lhs_shape, dtype=dtype) / jnp.sqrt(k) + rhs = jax.random.uniform(subkeys[2], rhs_shape, dtype=dtype) / jnp.sqrt(k) + # rhs = jnp.concatenate([i/n_groups*jnp.identity(k, dtype=dtype).reshape(1, k, k) for i in range(n_groups)], axis=0) bias = jax.random.uniform(subkeys[3], bias_shape, dtype=dtype) if with_bias else None lhs_contracting_dim = (1,) if data_layout[0] == "N" else (0,) @@ -1833,13 +1836,50 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi return lhs, rhs, group_sizes, contracting_dims, bias + def _diff_to_image(self, a, b): + import numpy as np + from PIL import Image + # Convert to numpy and compute diff + a_np = np.array(a) + b_np = np.array(b) + diff = np.abs(a_np - b_np) + + # Normalize diff to 0-255 range for visualization + diff_normalized = (diff - diff.min()) / (diff.max() - diff.min() + 1e-8) * 255 + diff_uint8 = diff_normalized.astype(np.uint8) + + # Create heatmap image + img = Image.fromarray(diff_uint8, mode='L') + return img + def _assert_grouped_gemm_output(self, out, group_sizes, ref_list, dtype): + import numpy as np + from PIL import Image assert out.dtype == ref_list[0].dtype - out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) - print([o.shape for o in out_list]) - print([r.shape for r in ref_list]) - for i in range(len(ref_list)): - assert_allclose(out_list[i], ref_list[i], dtype=dtype) + self._diff_to_image(out, ref_list).save('output_diff.png') + assert_allclose(out, ref_list, dtype=dtype) + + + + # ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) + # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) + # print([o.shape for o in out_list]) + # print([r.shape for r in ref_list]) + # for i in range(len(ref_list)): + # print(f"Asserting output for group {i}, output shape: {out_list[i].shape}, ref shape: {ref_list[i].shape}") + # # Convert to numpy and compute diff + # out_np = np.array(out_list[i]) + # ref_np = np.array(ref_list[i]) + # diff = np.abs(out_np - ref_np) + + # # Normalize diff to 0-255 range for visualization + # diff_normalized = (diff - diff.min()) / (diff.max() - diff.min() + 1e-8) * 255 + # diff_uint8 = diff_normalized.astype(np.uint8) + + # # Create heatmap image + # img = Image.fromarray(diff_uint8, mode='L') + # img.save(f'output_group_{i}.png') + # assert_allclose(out_list[i], ref_list[i], dtype=dtype) @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) @pytest_parametrize_wrapper("layout", ["NN"]) @@ -1943,7 +1983,7 @@ def test_grouped_dense_grad_fp16(self, dtype, input_shape): assert_allclose(prim_out_sum, ref_out_sum, dtype=dtype) assert_allclose(prim_dgrad, ref_dgrad, dtype=dtype) assert_allclose(prim_wgrad, ref_wgrad, dtype=dtype) - assert_allclose(prim_dbias, ref_dbias, dtype=dtype) + # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason) @pytest.mark.parametrize( @@ -1988,7 +2028,7 @@ def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): assert_allclose(prim_out_sum, ref_out_sum, dtype=fwd_dtype) assert_allclose(prim_dgrad, ref_dgrad, dtype=bwd_dtype) assert_allclose(prim_wgrad, ref_wgrad, dtype=bwd_dtype) - assert_allclose(prim_dbias, ref_dbias, dtype=dtype) + # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) @pytest_parametrize_wrapper('eqn,a_shape,b_shape', [ # ('ij,jk->ik', (64, 32), (32, 128)), diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 2ab578b8d0..e10a9b9ac6 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -519,56 +519,6 @@ JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const& data, std::opti return grouped_tensor_wrapper; } -// NVTEGroupedTensor make_grouped_tensor(Buffer_Type const& data, std::optional scale_inv, JAXX_Scaling_Mode scaling_mode, size_t num_tensors, NVTEShape const& dataShape) { -// // printf("make_grouped_tensor data shape: "); -// // for (auto dim : data.dimensions()) { -// // printf("%zu, ", dim); -// // } -// // printf("\n"); -// // NVTEShape logical_shape{}; -// // if (data.dimensions().size() == 1) { -// // // HACK -// // size_t cdim_size = 4096; -// // logical_shape.ndim = 2; -// // logical_shape.data[0] = data.dimensions()[0] / cdim_size; -// // logical_shape.data[1] = cdim_size; -// // printf("NUM TENSORS: %zu\n", num_tensors); -// // } -// // else { -// // NVTE_CHECK(data.dimensions().size() == 2, "Expected 2D tensor for GEMM operand but received ndim=", data.dimensions().size()); - -// // logical_shape.ndim = 2; -// // logical_shape.data[0] = data.dimensions()[0]; -// // logical_shape.data[1] = data.dimensions()[1]; -// // } - -// NVTEGroupedTensor grouped_tensor = nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, dataShape); - -// NVTEBasicTensor data_tensor{reinterpret_cast(data.untyped_data()), -// static_cast(convert_ffi_datatype_to_te_dtype(data.element_type())), -// dataShape}; -// nvte_set_grouped_tensor_param(&grouped_tensor, kNVTEGroupedRowwiseData, &data_tensor); - -// if (scale_inv.has_value()) { -// NVTEShape logical_scale_shape{}; -// if (scale_inv->dimensions().size() == 1) { -// logical_scale_shape.ndim = 1; -// logical_scale_shape.data[0] = scale_inv->dimensions()[0]; -// } else if (scale_inv->dimensions().size() == 2) { -// logical_scale_shape.ndim = 2; -// logical_scale_shape.data[0] = scale_inv->dimensions()[0]; -// logical_scale_shape.data[1] = scale_inv->dimensions()[1]; -// } else { -// NVTE_CHECK(false, "Expected 1D or 2D tensor for GEMM scale_inv but received ndim=", scale_inv->dimensions().size()); -// } -// NVTEBasicTensor scale_inv_tensor{reinterpret_cast(scale_inv->untyped_data()), -// static_cast(convert_ffi_datatype_to_te_dtype(scale_inv->element_type())), -// logical_scale_shape}; -// nvte_set_grouped_tensor_param(&grouped_tensor, kNVTEGroupedRowwiseScaleInv, &scale_inv_tensor); -// } - -// return grouped_tensor; -// } Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv, Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias, @@ -705,11 +655,11 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type convert_ffi_datatype_to_te_dtype(beta.element_type())); - printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu\n", num_gemms, m, n, k, group_sizes.dimensions()[0] / 2); + printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0] / 2, lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); //// RHS NVTEShape rhsShape{.data={k, n}, .ndim=2}; - if (rhs_is_trans) { + if (rhs_is_trans && !is_grouped_dense_wgrad) { std::swap(rhsShape.data[0], rhsShape.data[1]); } if (!is_grouped_dense_wgrad) { @@ -717,12 +667,19 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type rhsShape.data[0] *= num_gemms; } auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); - + if (is_grouped_dense_wgrad) { + rhs_tensor.set_group_info(group_sizes, group_offsets); + } + //// LHS - NVTEShape lhsShape{.data={m, k}, .ndim=2}; + NVTEShape lhsShape{.data={k, m}, .ndim=2}; if (lhs_is_trans && is_grouped_dense_wgrad) { std::swap(lhsShape.data[0], lhsShape.data[1]); } + if (is_grouped_dense_wgrad) { + // If is_grouped_dense_wgrad, then m already includes num_gemms (G) pre-multiplied in gemm.py, so we don't need to multiply it here. + lhsShape.data[0] *= num_gemms; + } auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); if (!is_grouped_dense_wgrad) { lhs_tensor.set_group_info(group_sizes, group_offsets); @@ -734,7 +691,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type outShape.data[0] *= num_gemms; } auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - if (is_grouped_dense_wgrad) { + if (!is_grouped_dense_wgrad) { out_tensor.set_group_info(group_sizes, group_offsets); } @@ -746,7 +703,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); // HACK: jberchtold FIXME - // cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); + cudaMemsetAsync(output->untyped_data(), 0xFF, output->size_bytes(), stream); nvte_grouped_gemm( rhs_tensor, rhs_is_trans, From 38fa2a58d0c5591e11b144540602bff4c88ffa01 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 22 Jan 2026 12:54:50 -0800 Subject: [PATCH 60/98] backup, some tests working --- tests/jax/test_custom_call_compute.py | 191 ++++++++---------- transformer_engine/jax/cpp_extensions/gemm.py | 2 +- .../jax/csrc/extensions/gemm.cpp | 24 +-- 3 files changed, 102 insertions(+), 115 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index bcee1d4860..b5d6721583 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1767,9 +1767,8 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): # (3, 192, 64, 96), - (8, 64*8, 128*8, 128*8), - # (8, 64, 32, 128), - # (8, 64, 128, 256), + (8, 64, 32, 128), + (8, 64, 128, 256), ] # TODO(jberchtold): Support MXFP8 and NVFP4 @@ -1799,7 +1798,7 @@ def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): lhs_i, rhs_i, dim_num, precision=jax.lax.Precision.HIGHEST ) + jnp.expand_dims(bias_i, axis=0) ref_out.append(jnp.squeeze(out_i)) - return jnp.concatenate(ref_out, axis=0) + return ref_out def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", with_bias=False): key = jax.random.PRNGKey(0) @@ -1815,11 +1814,11 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi group_sizes = group_sizes.at[1].set(0) # *32 to make sure that input shape works for MXFP8 - # group_sizes = group_sizes * 32 - # m = m * 32 + group_sizes = group_sizes * 32 + m = m * 32 - group_sizes = jnp.full((n_groups,), m // n_groups) - assert group_sizes.sum() == m + # group_sizes = jnp.full((n_groups,), m // n_groups) + # assert group_sizes.sum() == m lhs_shape = (m if data_layout[0] == "N" else k, k if data_layout[0] == "N" else m) rhs_shape = (n_groups, k if data_layout[1] == "N" else n, n if data_layout[1] == "N" else k) @@ -1853,33 +1852,21 @@ def _diff_to_image(self, a, b): return img def _assert_grouped_gemm_output(self, out, group_sizes, ref_list, dtype): - import numpy as np - from PIL import Image assert out.dtype == ref_list[0].dtype - self._diff_to_image(out, ref_list).save('output_diff.png') - assert_allclose(out, ref_list, dtype=dtype) - - + # self._diff_to_image(out, ref_list).save('output_diff.png') + # assert_allclose(out, ref_list, dtype=dtype) # ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) - # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) - # print([o.shape for o in out_list]) - # print([r.shape for r in ref_list]) - # for i in range(len(ref_list)): - # print(f"Asserting output for group {i}, output shape: {out_list[i].shape}, ref shape: {ref_list[i].shape}") - # # Convert to numpy and compute diff - # out_np = np.array(out_list[i]) - # ref_np = np.array(ref_list[i]) - # diff = np.abs(out_np - ref_np) - - # # Normalize diff to 0-255 range for visualization - # diff_normalized = (diff - diff.min()) / (diff.max() - diff.min() + 1e-8) * 255 - # diff_uint8 = diff_normalized.astype(np.uint8) - - # # Create heatmap image - # img = Image.fromarray(diff_uint8, mode='L') - # img.save(f'output_group_{i}.png') - # assert_allclose(out_list[i], ref_list[i], dtype=dtype) + out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) + print([o.shape for o in out_list]) + print([r.shape for r in ref_list]) + for i in range(len(ref_list)): + print(f"Asserting output for group {i}, output shape: {out_list[i].shape}, ref shape: {ref_list[i].shape}") + assert_allclose( + out_list[i], + ref_list[i], + dtype=jnp.float8_e4m3fn # HACK: TE impl is close but not precise enough for 16-bit + ) @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) @pytest_parametrize_wrapper("layout", ["NN"]) @@ -1959,76 +1946,76 @@ def _primitive_sum_grouped_dense( ) return jnp.sum(jnp.asarray(out)) / jnp.sqrt(x.size) - @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) - def test_grouped_dense_grad_fp16(self, dtype, input_shape): - x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( - dtype, - input_shape, - with_bias=False, - ) - - value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) - # jitting the grouped_dense - value_n_grad_prim_func = jit( - value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,) - ) - - ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func( - x, kernel, bias, group_sizes, contracting_dims - ) - prim_out_sum, (prim_dgrad, prim_wgrad, prim_dbias) = value_n_grad_prim_func( - x, kernel, bias, group_sizes, contracting_dims - ) - - assert_allclose(prim_out_sum, ref_out_sum, dtype=dtype) - assert_allclose(prim_dgrad, ref_dgrad, dtype=dtype) - assert_allclose(prim_wgrad, ref_wgrad, dtype=dtype) - # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) - - @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason) - @pytest.mark.parametrize( - "fwd_bwd_dtype", - [(jnp.float8_e4m3fn, jnp.float8_e4m3fn), (jnp.float8_e4m3fn, jnp.float8_e5m2)], - ) - @pytest_parametrize_wrapper("scaling_mode", grouped_gemm_supported_scaling_modes) - def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): - fwd_dtype, bwd_dtype = fwd_bwd_dtype - dtype = jnp.bfloat16 - x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( - dtype, - input_shape, - with_bias=False, - ) - - quantizer_set = QuantizerFactory.create_set( - scaling_mode=scaling_mode, - fwd_dtype=fwd_dtype, - bwd_dtype=bwd_dtype, - is_2x2x=True, - n_groups=group_sizes.size, - ) - value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) - - # jitting the grouped_dense - value_n_grad_prim_func = jit( - value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,) - ) - - ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func( - x, - kernel, - bias, - group_sizes, - contracting_dims, - ) - prim_out_sum, (prim_dgrad, prim_wgrad, prim_dbias) = value_n_grad_prim_func( - x, kernel, bias, group_sizes, contracting_dims, quantizer_set=quantizer_set - ) - - assert_allclose(prim_out_sum, ref_out_sum, dtype=fwd_dtype) - assert_allclose(prim_dgrad, ref_dgrad, dtype=bwd_dtype) - assert_allclose(prim_wgrad, ref_wgrad, dtype=bwd_dtype) - # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) + # @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) + # def test_grouped_dense_grad_fp16(self, dtype, input_shape): + # x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( + # dtype, + # input_shape, + # with_bias=False, + # ) + + # value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) + # # jitting the grouped_dense + # value_n_grad_prim_func = jit( + # value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,) + # ) + + # ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func( + # x, kernel, bias, group_sizes, contracting_dims + # ) + # prim_out_sum, (prim_dgrad, prim_wgrad, prim_dbias) = value_n_grad_prim_func( + # x, kernel, bias, group_sizes, contracting_dims + # ) + + # assert_allclose(prim_out_sum, ref_out_sum, dtype=dtype) + # assert_allclose(prim_dgrad, ref_dgrad, dtype=dtype) + # assert_allclose(prim_wgrad, ref_wgrad, dtype=dtype) + # # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) + + # @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason) + # @pytest.mark.parametrize( + # "fwd_bwd_dtype", + # [(jnp.float8_e4m3fn, jnp.float8_e4m3fn), (jnp.float8_e4m3fn, jnp.float8_e5m2)], + # ) + # @pytest_parametrize_wrapper("scaling_mode", grouped_gemm_supported_scaling_modes) + # def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): + # fwd_dtype, bwd_dtype = fwd_bwd_dtype + # dtype = jnp.bfloat16 + # x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( + # dtype, + # input_shape, + # with_bias=False, + # ) + + # quantizer_set = QuantizerFactory.create_set( + # scaling_mode=scaling_mode, + # fwd_dtype=fwd_dtype, + # bwd_dtype=bwd_dtype, + # is_2x2x=True, + # n_groups=group_sizes.size, + # ) + # value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) + + # # jitting the grouped_dense + # value_n_grad_prim_func = jit( + # value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,) + # ) + + # ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func( + # x, + # kernel, + # bias, + # group_sizes, + # contracting_dims, + # ) + # prim_out_sum, (prim_dgrad, prim_wgrad, prim_dbias) = value_n_grad_prim_func( + # x, kernel, bias, group_sizes, contracting_dims, quantizer_set=quantizer_set + # ) + + # assert_allclose(prim_out_sum, ref_out_sum, dtype=fwd_dtype) + # assert_allclose(prim_dgrad, ref_dgrad, dtype=bwd_dtype) + # assert_allclose(prim_wgrad, ref_wgrad, dtype=bwd_dtype) + # # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) @pytest_parametrize_wrapper('eqn,a_shape,b_shape', [ # ('ij,jk->ik', (64, 32), (32, 128)), diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 23d774b2ca..a919bcac38 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -2162,5 +2162,5 @@ def grouped_gemm( is_grouped_dense_wgrad=is_grouped_dense_wgrad, use_async_d2h_group_sizes=use_async_d2h_group_sizes, ) - print(f"GroupedGemm: {lhs_data.shape=}, {rhs_data.shape=}, {out.shape=}") + print(f"GroupedGemm: {lhs_data.shape=}, {rhs_data.shape=}, {out.shape=}, {M=}, {N=}, {K_lhs=}, {lhs_is_trans=}, {rhs_is_trans=}, {contracting_dims=}") return out diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index e10a9b9ac6..f59c0bc5fe 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -502,7 +502,7 @@ void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, NVTEDType::kNVTEInt64, shape}; - nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedFirstDims, &m_sizes_tensor); + nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedLastDims, &m_sizes_tensor); nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedTensorOffsets, &m_offsets_tensor); } @@ -667,12 +667,12 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type rhsShape.data[0] *= num_gemms; } auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); - if (is_grouped_dense_wgrad) { - rhs_tensor.set_group_info(group_sizes, group_offsets); - } + // if (is_grouped_dense_wgrad) { + // rhs_tensor.set_group_info(group_sizes, group_offsets); + // } //// LHS - NVTEShape lhsShape{.data={k, m}, .ndim=2}; + NVTEShape lhsShape{.data={m, k}, .ndim=2}; if (lhs_is_trans && is_grouped_dense_wgrad) { std::swap(lhsShape.data[0], lhsShape.data[1]); } @@ -681,9 +681,9 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhsShape.data[0] *= num_gemms; } auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - if (!is_grouped_dense_wgrad) { - lhs_tensor.set_group_info(group_sizes, group_offsets); - } + // if (!is_grouped_dense_wgrad) { + // lhs_tensor.set_group_info(group_sizes, group_offsets); + // } //// OUTPUT NVTEShape outShape{.data={m, n}, .ndim=2}; @@ -691,9 +691,9 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type outShape.data[0] *= num_gemms; } auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - if (!is_grouped_dense_wgrad) { - out_tensor.set_group_info(group_sizes, group_offsets); - } + // if (!is_grouped_dense_wgrad) { + // out_tensor.set_group_info(group_sizes, group_offsets); + // } printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", rhsShape.data[0], rhsShape.data[1], @@ -703,7 +703,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); // HACK: jberchtold FIXME - cudaMemsetAsync(output->untyped_data(), 0xFF, output->size_bytes(), stream); + // cudaMemsetAsync(output->untyped_data(), 0xFF, output->size_bytes(), stream); nvte_grouped_gemm( rhs_tensor, rhs_is_trans, From 6b32275de8f7c0d907cc26797049f864d6c3c848 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 22 Jan 2026 13:25:45 -0800 Subject: [PATCH 61/98] backup --- tests/jax/test_custom_call_compute.py | 6 +++--- transformer_engine/jax/csrc/extensions/gemm.cpp | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index b5d6721583..e52d83d85e 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1780,7 +1780,7 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): @pytest_parametrize_wrapper("input_shape", GROUPED_DENSE_INPUT_SHAPES) class TestGroupedDense: def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): - # return jax.lax.ragged_dot(lhs, rhs, group_sizes) + return jax.lax.ragged_dot(lhs, rhs, group_sizes) lhs_contract_dim, _ = contracting_dims assert len(lhs_contract_dim) == 1 and lhs.ndim == 2 and rhs.ndim == 3 if bias is None: @@ -1856,7 +1856,7 @@ def _assert_grouped_gemm_output(self, out, group_sizes, ref_list, dtype): # self._diff_to_image(out, ref_list).save('output_diff.png') # assert_allclose(out, ref_list, dtype=dtype) - # ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) + ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) print([o.shape for o in out_list]) print([r.shape for r in ref_list]) @@ -1865,7 +1865,7 @@ def _assert_grouped_gemm_output(self, out, group_sizes, ref_list, dtype): assert_allclose( out_list[i], ref_list[i], - dtype=jnp.float8_e4m3fn # HACK: TE impl is close but not precise enough for 16-bit + dtype=dtype, #jnp.float8_e4m3fn # HACK: TE impl is close but not precise enough for 16-bit ) @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index f59c0bc5fe..dca1a3d0eb 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -502,7 +502,7 @@ void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, NVTEDType::kNVTEInt64, shape}; - nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedLastDims, &m_sizes_tensor); + nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedFirstDims, &m_sizes_tensor); nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedTensorOffsets, &m_offsets_tensor); } @@ -681,9 +681,9 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhsShape.data[0] *= num_gemms; } auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - // if (!is_grouped_dense_wgrad) { - // lhs_tensor.set_group_info(group_sizes, group_offsets); - // } + if (!is_grouped_dense_wgrad) { + lhs_tensor.set_group_info(group_sizes, group_offsets); + } //// OUTPUT NVTEShape outShape{.data={m, n}, .ndim=2}; @@ -691,9 +691,9 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type outShape.data[0] *= num_gemms; } auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - // if (!is_grouped_dense_wgrad) { - // out_tensor.set_group_info(group_sizes, group_offsets); - // } + if (!is_grouped_dense_wgrad) { + out_tensor.set_group_info(group_sizes, group_offsets); + } printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", rhsShape.data[0], rhsShape.data[1], From f0f126d645e7a21c221f3905acd8a609254381b5 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 22 Jan 2026 15:54:26 -0800 Subject: [PATCH 62/98] progress --- tests/jax/test_custom_call_compute.py | 37 ++++++++--------- transformer_engine/jax/cpp_extensions/gemm.py | 36 ++++++++++------- .../jax/csrc/extensions/gemm.cpp | 40 ++++--------------- transformer_engine/jax/dense.py | 3 ++ 4 files changed, 52 insertions(+), 64 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index e52d83d85e..103ac7eedc 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1767,8 +1767,9 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): # (3, 192, 64, 96), - (8, 64, 32, 128), - (8, 64, 128, 256), + (8, 16384, 14336, 4096), + # (8, 64, 32, 128), + # (8, 64, 128, 256), ] # TODO(jberchtold): Support MXFP8 and NVFP4 @@ -1814,11 +1815,11 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi group_sizes = group_sizes.at[1].set(0) # *32 to make sure that input shape works for MXFP8 - group_sizes = group_sizes * 32 - m = m * 32 + # group_sizes = group_sizes * 32 + # m = m * 32 # group_sizes = jnp.full((n_groups,), m // n_groups) - # assert group_sizes.sum() == m + assert group_sizes.sum() == m lhs_shape = (m if data_layout[0] == "N" else k, k if data_layout[0] == "N" else m) rhs_shape = (n_groups, k if data_layout[1] == "N" else n, n if data_layout[1] == "N" else k) @@ -1854,19 +1855,19 @@ def _diff_to_image(self, a, b): def _assert_grouped_gemm_output(self, out, group_sizes, ref_list, dtype): assert out.dtype == ref_list[0].dtype # self._diff_to_image(out, ref_list).save('output_diff.png') - # assert_allclose(out, ref_list, dtype=dtype) - - ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) - out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) - print([o.shape for o in out_list]) - print([r.shape for r in ref_list]) - for i in range(len(ref_list)): - print(f"Asserting output for group {i}, output shape: {out_list[i].shape}, ref shape: {ref_list[i].shape}") - assert_allclose( - out_list[i], - ref_list[i], - dtype=dtype, #jnp.float8_e4m3fn # HACK: TE impl is close but not precise enough for 16-bit - ) + assert_allclose(out, ref_list, dtype=dtype) + + # ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) + # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) + # print([o.shape for o in out_list]) + # print([r.shape for r in ref_list]) + # for i in range(len(ref_list)): + # print(f"Asserting output for group {i}, output shape: {out_list[i].shape}, ref shape: {ref_list[i].shape}") + # assert_allclose( + # out_list[i], + # ref_list[i], + # dtype=dtype, #jnp.float8_e4m3fn # HACK: TE impl is close but not precise enough for 16-bit + # ) @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) @pytest_parametrize_wrapper("layout", ["NN"]) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index a919bcac38..9a277edbea 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -1463,7 +1463,7 @@ class GroupedGemmPrimitive(BasePrimitive): name = "te_grouped_gemm_ffi" multiple_results = True - impl_static_args = (9, 10, 11, 12, 13, 14, 15, 16, 17, 18) + impl_static_args = (10, 11, 12, 13, 14, 15, 16, 17, 18, 19) inner_primitive = None outer_primitive = None @@ -1475,7 +1475,8 @@ def abstract( rhs_scale_inv_aval, bias_aval, group_sizes_aval, - group_offset_aval, + group_offset_lhs_aval, + group_offset_out_aval, alpha, beta, *, @@ -1515,7 +1516,7 @@ def abstract( Returns: A jnp.ndarray containing the result of the grouped GEMM operation """ - del lhs_data_aval, rhs_data_aval, bias_aval, group_offset_aval + del lhs_data_aval, rhs_data_aval, bias_aval, group_offset_out_aval del K, lhs_is_trans, rhs_is_trans, has_bias, use_async_d2h_group_sizes # TODO(Phuong): move some shape checks from Cpp to here workspace_size = get_cublas_workspace_size_bytes() * num_cublas_streams @@ -1591,7 +1592,8 @@ def impl( rhs_scale_inv, bias, group_sizes, - group_offset, + group_offset_lhs, + group_offset_out, alpha, beta, M, @@ -1613,7 +1615,8 @@ def impl( rhs_scale_inv, bias, group_sizes, - group_offset, + group_offset_lhs, + group_offset_out, alpha, beta, M=M, @@ -1982,6 +1985,8 @@ def grouped_gemm( rhs: [G, N, K] or [G, K, N] or [G * K, N] or [N, G * K] """ + assert group_offset is None, "group_offset is not yet implemented" + # TODO(Phuong): implement the precision del precision @@ -2121,14 +2126,15 @@ def grouped_gemm( bias = jnp.empty((), jnp.float32) if bias is None else bias - if group_offset is None: - # Compute group_offset as cumulative sum of group_sizes, starting with 0 - group_offset = jnp.concatenate([jnp.array([0], dtype=jnp.int32), jnp.cumsum(group_sizes, dtype=jnp.int32)[:-1]]) - group_offset *= K_lhs # Offset is by number of elements total, not number of rows + # Compute group_offset as cumulative sum of group_sizes, starting with 0 + group_offset = jnp.concatenate([jnp.array([0], dtype=jnp.int32), jnp.cumsum(group_sizes, dtype=jnp.int32)[:-1]]) + group_offset_lhs = group_offset * K_lhs # Offset is by number of elements total, not number of rows + group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows - jax.debug.print("group_sizes: {}, group_offset: {}", group_sizes, group_offset) - jax.debug.print("M={}, jnp.sum(group_sizes)={}, N={}, K_lhs={}", M, jnp.sum(group_sizes), N, K_lhs) - jax.debug.print("lhs_data.size={}, group_offset={}", lhs_data.size, group_offset) + # jax.debug.print("group_sizes: {}, group_offset: {}", group_sizes, group_offset) + # jax.debug.print("M={}, jnp.sum(group_sizes)={}, N={}, K_lhs={}", M, jnp.sum(group_sizes), N, K_lhs) + # jax.debug.print("lhs_data.size={}, group_offset_lhs={}", lhs_data.size, group_offset_lhs) + # jax.debug.print("out_data.size=M*N={}, group_offset_out={}", M*N, group_offset_out) # print(f"{lhs_data.shape=}, {rhs_data.shape=}, {M=}, {N=}, {K_lhs=}") @@ -2136,7 +2142,8 @@ def grouped_gemm( # This ensures proper alignment and prevents overflow issues zeros = jnp.zeros_like(group_sizes, dtype=jnp.int32) group_sizes = jnp.stack([group_sizes, zeros], axis=1).flatten() - group_offset = jnp.stack([group_offset, zeros], axis=1).flatten() + group_offset_lhs = jnp.stack([group_offset_lhs, zeros], axis=1).flatten() + group_offset_out = jnp.stack([group_offset_out, zeros], axis=1).flatten() num_gemms = group_sizes.shape[0] // 2 # Due to interlaced zeros to support int64 alpha = jnp.ones((num_gemms,), jnp.float32) @@ -2148,7 +2155,8 @@ def grouped_gemm( rhs_scale_inv, bias, group_sizes, - group_offset, + group_offset_lhs, + group_offset_out, alpha, beta, M=M, diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index dca1a3d0eb..f3eb8618b2 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -522,7 +522,7 @@ JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const& data, std::opti Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv, Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias, - Buffer_Type group_sizes, Buffer_Type group_offsets, + Buffer_Type group_sizes, Buffer_Type group_offset_lhs, Buffer_Type group_offset_out, Buffer_Type alpha, Buffer_Type beta, Result_Type output, Result_Type workspace, size_t m, size_t n, size_t k, bool lhs_is_trans, @@ -658,42 +658,20 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0] / 2, lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); //// RHS - NVTEShape rhsShape{.data={k, n}, .ndim=2}; - if (rhs_is_trans && !is_grouped_dense_wgrad) { - std::swap(rhsShape.data[0], rhsShape.data[1]); - } - if (!is_grouped_dense_wgrad) { - // If is_grouped_dense_wgrad, then n already includes num_gemms (G) pre-multiplied in gemm.py, so we don't need to multiply it here. - rhsShape.data[0] *= num_gemms; - } + NVTEShape rhsShape{.data={num_gemms*k, n}, .ndim=2}; + // rhs_is_trans = true; auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); - // if (is_grouped_dense_wgrad) { - // rhs_tensor.set_group_info(group_sizes, group_offsets); - // } //// LHS NVTEShape lhsShape{.data={m, k}, .ndim=2}; - if (lhs_is_trans && is_grouped_dense_wgrad) { - std::swap(lhsShape.data[0], lhsShape.data[1]); - } - if (is_grouped_dense_wgrad) { - // If is_grouped_dense_wgrad, then m already includes num_gemms (G) pre-multiplied in gemm.py, so we don't need to multiply it here. - lhsShape.data[0] *= num_gemms; - } + lhs_is_trans = true; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - if (!is_grouped_dense_wgrad) { - lhs_tensor.set_group_info(group_sizes, group_offsets); - } + lhs_tensor.set_group_info(group_sizes, group_offset_lhs); //// OUTPUT NVTEShape outShape{.data={m, n}, .ndim=2}; - if (is_grouped_dense_wgrad) { - outShape.data[0] *= num_gemms; - } auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - if (!is_grouped_dense_wgrad) { - out_tensor.set_group_info(group_sizes, group_offsets); - } + out_tensor.set_group_info(group_sizes, group_offset_out); printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", rhsShape.data[0], rhsShape.data[1], @@ -702,9 +680,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); - // HACK: jberchtold FIXME - // cudaMemsetAsync(output->untyped_data(), 0xFF, output->size_bytes(), stream); - nvte_grouped_gemm( rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, @@ -729,7 +704,8 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmHandler, GroupedGemmFFI, .Arg() // rhs_sinv .Arg() // bias .Arg() // group_sizes - .Arg() // group_offset + .Arg() // group_offset_lhs + .Arg() // group_offset_out .Arg() // alpha .Arg() // beta .Ret() // output diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index 9db60d3bd8..990d686711 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -664,6 +664,9 @@ def _grouped_dense_bwd_rule( dbias = tex.grouped_dbias(grad, group_sizes) if use_bias else None dkernel_amax = None + # HACK: skip gradients for now + dgrad = jnp.zeros_like(dgrad) + wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set From ac9eda7fd963414552bd6c2e08a3875e2afd4e4d Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 22 Jan 2026 16:04:59 -0800 Subject: [PATCH 63/98] more progress --- transformer_engine/jax/cpp_extensions/gemm.py | 2 +- .../jax/csrc/extensions/gemm.cpp | 30 +++++++++---------- transformer_engine/jax/dense.py | 2 +- transformer_engine/jax/flax/module.py | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 9a277edbea..6284130273 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -2170,5 +2170,5 @@ def grouped_gemm( is_grouped_dense_wgrad=is_grouped_dense_wgrad, use_async_d2h_group_sizes=use_async_d2h_group_sizes, ) - print(f"GroupedGemm: {lhs_data.shape=}, {rhs_data.shape=}, {out.shape=}, {M=}, {N=}, {K_lhs=}, {lhs_is_trans=}, {rhs_is_trans=}, {contracting_dims=}") + # print(f"GroupedGemm: {lhs_data.shape=}, {rhs_data.shape=}, {out.shape=}, {M=}, {N=}, {K_lhs=}, {lhs_is_trans=}, {rhs_is_trans=}, {contracting_dims=}") return out diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index f3eb8618b2..cbf4eecc70 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -433,15 +433,15 @@ JAXX_GroupedTensorWrapper::JAXX_GroupedTensorWrapper(JAXX_Scaling_Mode scaling_m void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const& data, std::optional const& scale_inv) { - printf("set_rowwise data shape: XLA buffer shape: "); - for (auto dim : data.dimensions()) { - printf("%zu, ", dim); - } - printf("NVTEShape: "); - for (int i = 0; i < m_data_shape.ndim; ++i) { - printf("%d, ", m_data_shape.data[i]); - } - printf("\n"); + // printf("set_rowwise data shape: XLA buffer shape: "); + // for (auto dim : data.dimensions()) { + // printf("%zu, ", dim); + // } + // printf("NVTEShape: "); + // for (int i = 0; i < m_data_shape.ndim; ++i) { + // printf("%d, ", m_data_shape.data[i]); + // } + // printf("\n"); NVTEDType data_dtype = static_cast(convert_ffi_datatype_to_te_dtype(data.element_type())); m_data_tensor = NVTEBasicTensor{reinterpret_cast(data.untyped_data()), data_dtype, m_data_shape}; @@ -655,7 +655,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type convert_ffi_datatype_to_te_dtype(beta.element_type())); - printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0] / 2, lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); + // printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0] / 2, lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); //// RHS NVTEShape rhsShape{.data={num_gemms*k, n}, .ndim=2}; @@ -673,12 +673,12 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); out_tensor.set_group_info(group_sizes, group_offset_out); - printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", - rhsShape.data[0], rhsShape.data[1], - lhsShape.data[0], lhsShape.data[1], - outShape.data[0], outShape.data[1]); + // printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", + // rhsShape.data[0], rhsShape.data[1], + // lhsShape.data[0], lhsShape.data[1], + // outShape.data[0], outShape.data[1]); - printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); + // printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); nvte_grouped_gemm( rhs_tensor, rhs_is_trans, diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index 990d686711..45e361670f 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -665,7 +665,7 @@ def _grouped_dense_bwd_rule( dkernel_amax = None # HACK: skip gradients for now - dgrad = jnp.zeros_like(dgrad) + # dgrad = jnp.zeros_like(dgrad) wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index f9757d29b4..fc8f359c9a 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1534,7 +1534,7 @@ def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwa kernel, group_sizes=group_sizes, contracting_dims=((1,), (1,)), - # quantizer_set=quantizer_set + quantizer_set=quantizer_set ) return out.reshape(target_out_shape) From efb68fc41b559d60e1b2f77630ebb4bc1d90791c Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 22 Jan 2026 16:24:29 -0800 Subject: [PATCH 64/98] wgrad --- transformer_engine/jax/cpp_extensions/gemm.py | 9 ++++- .../jax/csrc/extensions/gemm.cpp | 40 +++++++++++++++++++ transformer_engine/jax/dense.py | 3 -- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 6284130273..8e9e89bdf9 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -2128,8 +2128,13 @@ def grouped_gemm( # Compute group_offset as cumulative sum of group_sizes, starting with 0 group_offset = jnp.concatenate([jnp.array([0], dtype=jnp.int32), jnp.cumsum(group_sizes, dtype=jnp.int32)[:-1]]) - group_offset_lhs = group_offset * K_lhs # Offset is by number of elements total, not number of rows - group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows + if is_grouped_dense_wgrad: + group_offset_lhs = group_offset * M # Offset is by number of elements total, not number of rows + # HACK: this is really the rhs in this case + group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows + else: + group_offset_lhs = group_offset * K_lhs # Offset is by number of elements total, not number of rows + group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows # jax.debug.print("group_sizes: {}, group_offset: {}", group_sizes, group_offset) # jax.debug.print("M={}, jnp.sum(group_sizes)={}, N={}, K_lhs={}", M, jnp.sum(group_sizes), N, K_lhs) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index cbf4eecc70..8758cd5cab 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -657,6 +657,46 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0] / 2, lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); + if (is_grouped_dense_wgrad) { + //// RHS + NVTEShape rhsShape{.data={k, n}, .ndim=2}; + // rhs_is_trans = true; + auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); + rhs_tensor.set_group_info(group_sizes, group_offset_out); + + //// LHS + NVTEShape lhsShape{.data={m, k}, .ndim=2}; + lhs_is_trans = false; + auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); + lhs_tensor.set_group_info(group_sizes, group_offset_lhs); + + //// OUTPUT + NVTEShape outShape{.data={num_gemms*m, n}, .ndim=2}; + auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); + + // printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", + // rhsShape.data[0], rhsShape.data[1], + // lhsShape.data[0], lhsShape.data[1], + // outShape.data[0], outShape.data[1]); + + // printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); + + nvte_grouped_gemm( + rhs_tensor, rhs_is_trans, + lhs_tensor, lhs_is_trans, + nullptr, + out_tensor, + alpha_tensor.data(), + beta_tensor.data(), + workspace_setup.data(), + workspace_cublas.data(), + nullptr, // config (use defaults) + stream); + return ffi_with_cuda_error_check(); + } + + // Nominal case for FWD or DGRAD + //// RHS NVTEShape rhsShape{.data={num_gemms*k, n}, .ndim=2}; // rhs_is_trans = true; diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index 45e361670f..9db60d3bd8 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -664,9 +664,6 @@ def _grouped_dense_bwd_rule( dbias = tex.grouped_dbias(grad, group_sizes) if use_bias else None dkernel_amax = None - # HACK: skip gradients for now - # dgrad = jnp.zeros_like(dgrad) - wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set From 831de225877685acbf9da32a15a2ebf7f862e4d2 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Sat, 24 Jan 2026 11:44:52 -0800 Subject: [PATCH 65/98] fix int32 overflow messing up group offsets --- tests/jax/test_custom_call_compute.py | 58 +++++++++++++------ tests/jax/test_permutation.py | 4 +- .../jax/csrc/extensions/gemm.cpp | 32 +++++----- 3 files changed, 59 insertions(+), 35 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 7de2a0de52..cb99de3a9d 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1768,6 +1768,7 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): # (3, 192, 64, 96), (8, 16384, 14336, 4096), + # (8, 16384, 16384, 4096), # (8, 64, 32, 128), # (8, 64, 128, 256), ] @@ -1806,6 +1807,7 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi subkeys = jax.random.split(key, 4) n_groups, m, n, k = input_shape + m //= 32 group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) group_sizes = jnp.diff(group_sizes) @@ -1815,8 +1817,8 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi group_sizes = group_sizes.at[1].set(0) # *32 to make sure that input shape works for MXFP8 - # group_sizes = group_sizes * 32 - # m = m * 32 + group_sizes = group_sizes * 32 + m = m * 32 # group_sizes = jnp.full((n_groups,), m // n_groups) assert group_sizes.sum() == m @@ -1836,27 +1838,45 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi return lhs, rhs, group_sizes, contracting_dims, bias - def _diff_to_image(self, a, b): + def _tensor_to_image(self, tensor, value_range=None): import numpy as np from PIL import Image - # Convert to numpy and compute diff - a_np = np.array(a) - b_np = np.array(b) - diff = np.abs(a_np - b_np) - - # Normalize diff to 0-255 range for visualization - diff_normalized = (diff - diff.min()) / (diff.max() - diff.min() + 1e-8) * 255 - diff_uint8 = diff_normalized.astype(np.uint8) - - # Create heatmap image - img = Image.fromarray(diff_uint8, mode='L') + # Convert to numpy + tensor_np = jnp.array(tensor, dtype=jnp.float32) + + # Replace NaNs with a large value for visualization + tensor_np = jnp.where(jnp.isnan(tensor_np), 5000, tensor_np) + + # Determine normalization range + if value_range is None: + min_val = tensor_np.min() + max_val = tensor_np.max() + else: + min_val, max_val = value_range + + # Normalize to 0-255 range for visualization + range_val = max_val - min_val + 1e-8 + normalized = jnp.clip((tensor_np - min_val) / range_val * 255, 0, 255) + + # Downsample by averaging 4x4 blocks + h, w = normalized.shape + new_h, new_w = h // 4, w // 4 + normalized = normalized[:new_h*4, :new_w*4] # Trim to multiple of 4 + normalized = normalized.reshape(new_h, 4, new_w, 4).mean(axis=(1, 3)) + normalized = np.array(normalized) + normalized_uint8 = normalized.astype(np.uint8) + + # Create grayscale image + img = Image.fromarray(normalized_uint8, mode='L') return img - def _assert_grouped_gemm_output(self, out, group_sizes, ref_list, dtype): - assert out.dtype == ref_list[0].dtype - # self._diff_to_image(out, ref_list).save('output_diff.png') - assert_allclose(out, ref_list, dtype=dtype) - + def _assert_grouped_gemm_output(self, out, group_sizes, ref, dtype): + assert out.dtype == ref.dtype + print(f"Group sizes [{jnp.sum(group_sizes)}]: {group_sizes}") + self._tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save('output_te.png') + self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save('output_ref.png') + self._tensor_to_image(jnp.abs(out.astype(jnp.float32) - ref.astype(jnp.float32)), value_range=(jnp.min(ref), jnp.max(ref))).save('output_diff.png') + assert_allclose(out, ref, dtype=dtype) # ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) # print([o.shape for o in out_list]) diff --git a/tests/jax/test_permutation.py b/tests/jax/test_permutation.py index 5bb59c6ed5..df0308e5c7 100644 --- a/tests/jax/test_permutation.py +++ b/tests/jax/test_permutation.py @@ -753,12 +753,12 @@ def test_sort_chunks_by_index(self, num_splits, total_tokens, hidden_size, dtype @jax.jit def loss_fn(x): output, _ = sort_chunks_by_index(x, split_sizes, sorted_indices) - return jnp.sum(output**2) + return jnp.mean(output) @jax.jit def ref_loss_fn(x): output, _ = reference_sort_chunks_by_map(x, row_id_map, None, is_forward=True) - return jnp.sum(output**2) + return jnp.mean(output) # Test forward pass output, _ = sort_chunks_by_index(inp, split_sizes, sorted_indices) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 1fc6345655..d2b7b5e23b 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -417,7 +417,8 @@ class JAXX_GroupedTensorWrapper { ~JAXX_GroupedTensorWrapper() = default; void set_rowwise(Buffer_Type const& data, std::optional const& scale_inv); - void set_group_info(Buffer_Type const& group_sizes, Buffer_Type const& group_offsets); + void set_group_info(Buffer_Type const& group_sizes, Buffer_Type const& group_offsets, + NVTEGroupedTensorParam group_sizes_param_name); operator NVTEGroupedTensor() const { return m_grouped_tensor; } NVTEGroupedTensor const& get_grouped_tensor() const; @@ -481,24 +482,25 @@ void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const& data, } void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, - Buffer_Type const& group_offsets) { + Buffer_Type const& group_offsets, + NVTEGroupedTensorParam group_sizes_param_name) { NVTEDType sizes_dtype = static_cast(convert_ffi_datatype_to_te_dtype(group_sizes.element_type())); NVTEDType offsets_dtype = static_cast(convert_ffi_datatype_to_te_dtype(group_offsets.element_type())); - NVTE_CHECK(sizes_dtype == NVTEDType::kNVTEInt32, - "group_sizes must be of type int32."); - NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt32, - "group_offsets must be of type int32."); + NVTE_CHECK(sizes_dtype == NVTEDType::kNVTEInt64, + "group_sizes must be of type int64."); + NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt64, + "group_offsets must be of type int64."); // JAX only supports int32 but cuBLAS requires int64 so we pack two int32 into one int64 - size_t num_tensors = group_sizes.dimensions()[0] / 2; + size_t num_tensors = group_sizes.dimensions()[0]; NVTE_CHECK(group_sizes.dimensions().size() == 1, "group_sizes must be a 1D tensor with length equal to the number of tensors."); NVTE_CHECK(group_offsets.dimensions().size() == 1, "group_offsets must be a 1D tensor with length equal to the number of tensors."); - NVTE_CHECK(group_offsets.dimensions()[0] == 2 * num_tensors, + NVTE_CHECK(group_offsets.dimensions()[0] == num_tensors, "group_sizes and group_offsets must have the same number of elements."); NVTEShape shape{}; @@ -512,7 +514,7 @@ void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, NVTEDType::kNVTEInt64, shape}; - nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedFirstDims, &m_sizes_tensor); + nvte_set_grouped_tensor_param(&m_grouped_tensor, group_sizes_param_name, &m_sizes_tensor); nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedTensorOffsets, &m_offsets_tensor); } @@ -567,7 +569,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type auto bias_dtype = convert_ffi_datatype_to_te_dtype(bias.element_type()); NVTE_CHECK(group_sizes.dimensions().size() == 1); - size_t num_gemms = group_sizes.dimensions()[0] / 2; // JAX only supports int32 but cuBLAS requires int64 so we pack two int32 into one int64 + size_t num_gemms = group_sizes.dimensions()[0]; // It is weird that TE/Common GEMM only use colwise for MXFP8 const bool is_fp8_gemm = is_fp8_dtype(lhs_dtype); @@ -672,13 +674,13 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type NVTEShape rhsShape{.data={k, n}, .ndim=2}; // rhs_is_trans = true; auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); - rhs_tensor.set_group_info(group_sizes, group_offset_out); + rhs_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); //// LHS NVTEShape lhsShape{.data={m, k}, .ndim=2}; lhs_is_trans = false; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - lhs_tensor.set_group_info(group_sizes, group_offset_lhs); + lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedLastDims); //// OUTPUT NVTEShape outShape{.data={num_gemms*m, n}, .ndim=2}; @@ -691,6 +693,8 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); + cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); + nvte_grouped_gemm( rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, @@ -716,12 +720,12 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type NVTEShape lhsShape{.data={m, k}, .ndim=2}; lhs_is_trans = true; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - lhs_tensor.set_group_info(group_sizes, group_offset_lhs); + lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedFirstDims); //// OUTPUT NVTEShape outShape{.data={m, n}, .ndim=2}; auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - out_tensor.set_group_info(group_sizes, group_offset_out); + out_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); // printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", // rhsShape.data[0], rhsShape.data[1], From 0e16fef8ef98cd2c76b47dd13bf88667b290a0e4 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Sat, 24 Jan 2026 12:16:39 -0800 Subject: [PATCH 66/98] backup attempt at not needing JAX x64 --- transformer_engine/jax/cpp_extensions/gemm.py | 40 ++++-------------- .../jax/csrc/extensions/gemm.cpp | 41 ++++++++++++------- 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 8f21bb42a2..71fc670aae 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -1475,8 +1475,7 @@ def abstract( rhs_scale_inv_aval, bias_aval, group_sizes_aval, - group_offset_lhs_aval, - group_offset_out_aval, + group_offset_aval, alpha, beta, *, @@ -1516,7 +1515,7 @@ def abstract( Returns: A jnp.ndarray containing the result of the grouped GEMM operation """ - del lhs_data_aval, rhs_data_aval, bias_aval, group_offset_out_aval + del lhs_data_aval, rhs_data_aval, bias_aval del K, lhs_is_trans, rhs_is_trans, has_bias, use_async_d2h_group_sizes # TODO(Phuong): move some shape checks from Cpp to here workspace_size = get_cublas_workspace_size_bytes() * num_cublas_streams @@ -1540,6 +1539,7 @@ def abstract( workspace_size += rhs_scale_inv_aval.size + mxfp8_scaling_sinv_alignment_padding workspace_size += 1024*1024 # HACK: properly make a workspace_setup buffer in addition to the workspace_cublas buffer + workspace_size += group_offset_aval.size * 8 * 2 # buffer for two int64 group offsets on device workspace_aval = jax.core.ShapedArray(shape=(workspace_size,), dtype=jnp.uint8) out_shape = (M, N) @@ -1592,8 +1592,7 @@ def impl( rhs_scale_inv, bias, group_sizes, - group_offset_lhs, - group_offset_out, + group_offset, alpha, beta, M, @@ -1615,8 +1614,7 @@ def impl( rhs_scale_inv, bias, group_sizes, - group_offset_lhs, - group_offset_out, + group_offset, alpha, beta, M=M, @@ -2128,29 +2126,8 @@ def grouped_gemm( # Compute group_offset as cumulative sum of group_sizes, starting with 0 group_offset = jnp.concatenate([jnp.array([0], dtype=jnp.int32), jnp.cumsum(group_sizes, dtype=jnp.int32)[:-1]]) - if is_grouped_dense_wgrad: - group_offset_lhs = group_offset * M # Offset is by number of elements total, not number of rows - # HACK: this is really the rhs in this case - group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows - else: - group_offset_lhs = group_offset * K_lhs # Offset is by number of elements total, not number of rows - group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows - - # jax.debug.print("group_sizes: {}, group_offset: {}", group_sizes, group_offset) - # jax.debug.print("M={}, jnp.sum(group_sizes)={}, N={}, K_lhs={}", M, jnp.sum(group_sizes), N, K_lhs) - # jax.debug.print("lhs_data.size={}, group_offset_lhs={}", lhs_data.size, group_offset_lhs) - # jax.debug.print("out_data.size=M*N={}, group_offset_out={}", M*N, group_offset_out) - - # print(f"{lhs_data.shape=}, {rhs_data.shape=}, {M=}, {N=}, {K_lhs=}") - - # Interlace zeros with group_sizes to upcast packed int32s to int64 - # This ensures proper alignment and prevents overflow issues - zeros = jnp.zeros_like(group_sizes, dtype=jnp.int32) - group_sizes = jnp.stack([group_sizes, zeros], axis=1).flatten() - group_offset_lhs = jnp.stack([group_offset_lhs, zeros], axis=1).flatten() - group_offset_out = jnp.stack([group_offset_out, zeros], axis=1).flatten() - - num_gemms = group_sizes.shape[0] // 2 # Due to interlaced zeros to support int64 + + num_gemms = group_sizes.shape[0] alpha = jnp.ones((num_gemms,), jnp.float32) beta = jnp.zeros((num_gemms,), jnp.float32) (out,) = GroupedGemmPrimitive.outer_primitive.bind( @@ -2160,8 +2137,7 @@ def grouped_gemm( rhs_scale_inv, bias, group_sizes, - group_offset_lhs, - group_offset_out, + group_offset, alpha, beta, M=M, diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index d2b7b5e23b..ba2999afc6 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -418,7 +418,9 @@ class JAXX_GroupedTensorWrapper { void set_rowwise(Buffer_Type const& data, std::optional const& scale_inv); void set_group_info(Buffer_Type const& group_sizes, Buffer_Type const& group_offsets, - NVTEGroupedTensorParam group_sizes_param_name); + NVTEGroupedTensorParam group_sizes_param_name, + size_t offset_stride, + TensorWrapper const& offset_scratch_int64); operator NVTEGroupedTensor() const { return m_grouped_tensor; } NVTEGroupedTensor const& get_grouped_tensor() const; @@ -483,18 +485,19 @@ void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const& data, void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, Buffer_Type const& group_offsets, - NVTEGroupedTensorParam group_sizes_param_name) { + NVTEGroupedTensorParam group_sizes_param_name, + size_t offset_stride, + TensorWrapper const& offset_scratch_int64) { NVTEDType sizes_dtype = static_cast(convert_ffi_datatype_to_te_dtype(group_sizes.element_type())); NVTEDType offsets_dtype = static_cast(convert_ffi_datatype_to_te_dtype(group_offsets.element_type())); - NVTE_CHECK(sizes_dtype == NVTEDType::kNVTEInt64, - "group_sizes must be of type int64."); - NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt64, - "group_offsets must be of type int64."); + NVTE_CHECK(sizes_dtype == NVTEDType::kNVTEInt32, + "group_sizes must be of type int32."); + NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt32, + "group_offsets must be of type int32."); - // JAX only supports int32 but cuBLAS requires int64 so we pack two int32 into one int64 size_t num_tensors = group_sizes.dimensions()[0]; NVTE_CHECK(group_sizes.dimensions().size() == 1, "group_sizes must be a 1D tensor with length equal to the number of tensors."); @@ -514,6 +517,9 @@ void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, NVTEDType::kNVTEInt64, shape}; + nvte_make_group_offsets_int64(offset_stride, &m_offsets_tensor, offset_scratch_int64.data(), + stream); + nvte_set_grouped_tensor_param(&m_grouped_tensor, group_sizes_param_name, &m_sizes_tensor); nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedTensorOffsets, &m_offsets_tensor); } @@ -534,7 +540,7 @@ JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const& data, std::opti Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv, Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias, - Buffer_Type group_sizes, Buffer_Type group_offset_lhs, Buffer_Type group_offset_out, + Buffer_Type group_sizes, Buffer_Type group_offset, Buffer_Type alpha, Buffer_Type beta, Result_Type output, Result_Type workspace, size_t m, size_t n, size_t k, bool lhs_is_trans, @@ -674,13 +680,17 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type NVTEShape rhsShape{.data={k, n}, .ndim=2}; // rhs_is_trans = true; auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); - rhs_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); + TensorWrapper group_offsets_rhs(workspace_ptr + workspace_setup_size + workspace_size, + std::vector{num_gemms}, DType::kInt64); + rhs_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims, n, group_offsets_rhs); //// LHS NVTEShape lhsShape{.data={m, k}, .ndim=2}; lhs_is_trans = false; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedLastDims); + TensorWrapper group_offsets_lhs(workspace_ptr + workspace_setup_size + workspace_size + sizeof(int64_t) * num_gemms, + std::vector{num_gemms}, DType::kInt64); + lhs_tensor.set_group_info(group_sizes, group_offset, kNVTEGroupedLastDims, k, group_offsets_lhs); //// OUTPUT NVTEShape outShape{.data={num_gemms*m, n}, .ndim=2}; @@ -720,12 +730,16 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type NVTEShape lhsShape{.data={m, k}, .ndim=2}; lhs_is_trans = true; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedFirstDims); + TensorWrapper group_offsets_lhs(workspace_ptr + workspace_setup_size + workspace_size, + std::vector{num_gemms}, DType::kInt64); + lhs_tensor.set_group_info(group_sizes, group_offset, kNVTEGroupedFirstDims, k, group_offsets_lhs); //// OUTPUT NVTEShape outShape{.data={m, n}, .ndim=2}; auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - out_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); + TensorWrapper group_offsets_out(workspace_ptr + workspace_setup_size + workspace_size + sizeof(int64_t) * num_gemms, + std::vector{num_gemms}, DType::kInt64); + out_tensor.set_group_info(group_sizes, group_offset, kNVTEGroupedFirstDims, n, group_offsets_out); // printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", // rhsShape.data[0], rhsShape.data[1], @@ -758,8 +772,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmHandler, GroupedGemmFFI, .Arg() // rhs_sinv .Arg() // bias .Arg() // group_sizes - .Arg() // group_offset_lhs - .Arg() // group_offset_out + .Arg() // group_offset .Arg() // alpha .Arg() // beta .Ret() // output From d1fe13d6ed5a734904326721804a52d3b19a5a74 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Sat, 24 Jan 2026 12:16:55 -0800 Subject: [PATCH 67/98] Revert "backup attempt at not needing JAX x64" This reverts commit 0e16fef8ef98cd2c76b47dd13bf88667b290a0e4. --- transformer_engine/jax/cpp_extensions/gemm.py | 40 ++++++++++++++---- .../jax/csrc/extensions/gemm.cpp | 41 +++++++------------ 2 files changed, 46 insertions(+), 35 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 71fc670aae..8f21bb42a2 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -1475,7 +1475,8 @@ def abstract( rhs_scale_inv_aval, bias_aval, group_sizes_aval, - group_offset_aval, + group_offset_lhs_aval, + group_offset_out_aval, alpha, beta, *, @@ -1515,7 +1516,7 @@ def abstract( Returns: A jnp.ndarray containing the result of the grouped GEMM operation """ - del lhs_data_aval, rhs_data_aval, bias_aval + del lhs_data_aval, rhs_data_aval, bias_aval, group_offset_out_aval del K, lhs_is_trans, rhs_is_trans, has_bias, use_async_d2h_group_sizes # TODO(Phuong): move some shape checks from Cpp to here workspace_size = get_cublas_workspace_size_bytes() * num_cublas_streams @@ -1539,7 +1540,6 @@ def abstract( workspace_size += rhs_scale_inv_aval.size + mxfp8_scaling_sinv_alignment_padding workspace_size += 1024*1024 # HACK: properly make a workspace_setup buffer in addition to the workspace_cublas buffer - workspace_size += group_offset_aval.size * 8 * 2 # buffer for two int64 group offsets on device workspace_aval = jax.core.ShapedArray(shape=(workspace_size,), dtype=jnp.uint8) out_shape = (M, N) @@ -1592,7 +1592,8 @@ def impl( rhs_scale_inv, bias, group_sizes, - group_offset, + group_offset_lhs, + group_offset_out, alpha, beta, M, @@ -1614,7 +1615,8 @@ def impl( rhs_scale_inv, bias, group_sizes, - group_offset, + group_offset_lhs, + group_offset_out, alpha, beta, M=M, @@ -2126,8 +2128,29 @@ def grouped_gemm( # Compute group_offset as cumulative sum of group_sizes, starting with 0 group_offset = jnp.concatenate([jnp.array([0], dtype=jnp.int32), jnp.cumsum(group_sizes, dtype=jnp.int32)[:-1]]) - - num_gemms = group_sizes.shape[0] + if is_grouped_dense_wgrad: + group_offset_lhs = group_offset * M # Offset is by number of elements total, not number of rows + # HACK: this is really the rhs in this case + group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows + else: + group_offset_lhs = group_offset * K_lhs # Offset is by number of elements total, not number of rows + group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows + + # jax.debug.print("group_sizes: {}, group_offset: {}", group_sizes, group_offset) + # jax.debug.print("M={}, jnp.sum(group_sizes)={}, N={}, K_lhs={}", M, jnp.sum(group_sizes), N, K_lhs) + # jax.debug.print("lhs_data.size={}, group_offset_lhs={}", lhs_data.size, group_offset_lhs) + # jax.debug.print("out_data.size=M*N={}, group_offset_out={}", M*N, group_offset_out) + + # print(f"{lhs_data.shape=}, {rhs_data.shape=}, {M=}, {N=}, {K_lhs=}") + + # Interlace zeros with group_sizes to upcast packed int32s to int64 + # This ensures proper alignment and prevents overflow issues + zeros = jnp.zeros_like(group_sizes, dtype=jnp.int32) + group_sizes = jnp.stack([group_sizes, zeros], axis=1).flatten() + group_offset_lhs = jnp.stack([group_offset_lhs, zeros], axis=1).flatten() + group_offset_out = jnp.stack([group_offset_out, zeros], axis=1).flatten() + + num_gemms = group_sizes.shape[0] // 2 # Due to interlaced zeros to support int64 alpha = jnp.ones((num_gemms,), jnp.float32) beta = jnp.zeros((num_gemms,), jnp.float32) (out,) = GroupedGemmPrimitive.outer_primitive.bind( @@ -2137,7 +2160,8 @@ def grouped_gemm( rhs_scale_inv, bias, group_sizes, - group_offset, + group_offset_lhs, + group_offset_out, alpha, beta, M=M, diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index ba2999afc6..d2b7b5e23b 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -418,9 +418,7 @@ class JAXX_GroupedTensorWrapper { void set_rowwise(Buffer_Type const& data, std::optional const& scale_inv); void set_group_info(Buffer_Type const& group_sizes, Buffer_Type const& group_offsets, - NVTEGroupedTensorParam group_sizes_param_name, - size_t offset_stride, - TensorWrapper const& offset_scratch_int64); + NVTEGroupedTensorParam group_sizes_param_name); operator NVTEGroupedTensor() const { return m_grouped_tensor; } NVTEGroupedTensor const& get_grouped_tensor() const; @@ -485,19 +483,18 @@ void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const& data, void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, Buffer_Type const& group_offsets, - NVTEGroupedTensorParam group_sizes_param_name, - size_t offset_stride, - TensorWrapper const& offset_scratch_int64) { + NVTEGroupedTensorParam group_sizes_param_name) { NVTEDType sizes_dtype = static_cast(convert_ffi_datatype_to_te_dtype(group_sizes.element_type())); NVTEDType offsets_dtype = static_cast(convert_ffi_datatype_to_te_dtype(group_offsets.element_type())); - NVTE_CHECK(sizes_dtype == NVTEDType::kNVTEInt32, - "group_sizes must be of type int32."); - NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt32, - "group_offsets must be of type int32."); + NVTE_CHECK(sizes_dtype == NVTEDType::kNVTEInt64, + "group_sizes must be of type int64."); + NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt64, + "group_offsets must be of type int64."); + // JAX only supports int32 but cuBLAS requires int64 so we pack two int32 into one int64 size_t num_tensors = group_sizes.dimensions()[0]; NVTE_CHECK(group_sizes.dimensions().size() == 1, "group_sizes must be a 1D tensor with length equal to the number of tensors."); @@ -517,9 +514,6 @@ void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, NVTEDType::kNVTEInt64, shape}; - nvte_make_group_offsets_int64(offset_stride, &m_offsets_tensor, offset_scratch_int64.data(), - stream); - nvte_set_grouped_tensor_param(&m_grouped_tensor, group_sizes_param_name, &m_sizes_tensor); nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedTensorOffsets, &m_offsets_tensor); } @@ -540,7 +534,7 @@ JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const& data, std::opti Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv, Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias, - Buffer_Type group_sizes, Buffer_Type group_offset, + Buffer_Type group_sizes, Buffer_Type group_offset_lhs, Buffer_Type group_offset_out, Buffer_Type alpha, Buffer_Type beta, Result_Type output, Result_Type workspace, size_t m, size_t n, size_t k, bool lhs_is_trans, @@ -680,17 +674,13 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type NVTEShape rhsShape{.data={k, n}, .ndim=2}; // rhs_is_trans = true; auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); - TensorWrapper group_offsets_rhs(workspace_ptr + workspace_setup_size + workspace_size, - std::vector{num_gemms}, DType::kInt64); - rhs_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims, n, group_offsets_rhs); + rhs_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); //// LHS NVTEShape lhsShape{.data={m, k}, .ndim=2}; lhs_is_trans = false; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - TensorWrapper group_offsets_lhs(workspace_ptr + workspace_setup_size + workspace_size + sizeof(int64_t) * num_gemms, - std::vector{num_gemms}, DType::kInt64); - lhs_tensor.set_group_info(group_sizes, group_offset, kNVTEGroupedLastDims, k, group_offsets_lhs); + lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedLastDims); //// OUTPUT NVTEShape outShape{.data={num_gemms*m, n}, .ndim=2}; @@ -730,16 +720,12 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type NVTEShape lhsShape{.data={m, k}, .ndim=2}; lhs_is_trans = true; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - TensorWrapper group_offsets_lhs(workspace_ptr + workspace_setup_size + workspace_size, - std::vector{num_gemms}, DType::kInt64); - lhs_tensor.set_group_info(group_sizes, group_offset, kNVTEGroupedFirstDims, k, group_offsets_lhs); + lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedFirstDims); //// OUTPUT NVTEShape outShape{.data={m, n}, .ndim=2}; auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - TensorWrapper group_offsets_out(workspace_ptr + workspace_setup_size + workspace_size + sizeof(int64_t) * num_gemms, - std::vector{num_gemms}, DType::kInt64); - out_tensor.set_group_info(group_sizes, group_offset, kNVTEGroupedFirstDims, n, group_offsets_out); + out_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); // printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", // rhsShape.data[0], rhsShape.data[1], @@ -772,7 +758,8 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmHandler, GroupedGemmFFI, .Arg() // rhs_sinv .Arg() // bias .Arg() // group_sizes - .Arg() // group_offset + .Arg() // group_offset_lhs + .Arg() // group_offset_out .Arg() // alpha .Arg() // beta .Ret() // output From 91d8d5d019c9d3603db52d67fce7f9a56e527adb Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Sat, 24 Jan 2026 12:29:24 -0800 Subject: [PATCH 68/98] fwd grouped gemm tests passing --- tests/jax/test_custom_call_compute.py | 140 +++++++++--------- transformer_engine/jax/cpp_extensions/gemm.py | 18 +-- .../jax/cpp_extensions/quantization.py | 2 +- .../jax/csrc/extensions/gemm.cpp | 3 - transformer_engine/jax/quantize/quantizer.py | 2 +- 5 files changed, 78 insertions(+), 87 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index cb99de3a9d..196302a29a 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1967,76 +1967,76 @@ def _primitive_sum_grouped_dense( ) return jnp.sum(jnp.asarray(out)) / jnp.sqrt(x.size) - # @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) - # def test_grouped_dense_grad_fp16(self, dtype, input_shape): - # x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( - # dtype, - # input_shape, - # with_bias=False, - # ) - - # value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) - # # jitting the grouped_dense - # value_n_grad_prim_func = jit( - # value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,) - # ) - - # ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func( - # x, kernel, bias, group_sizes, contracting_dims - # ) - # prim_out_sum, (prim_dgrad, prim_wgrad, prim_dbias) = value_n_grad_prim_func( - # x, kernel, bias, group_sizes, contracting_dims - # ) - - # assert_allclose(prim_out_sum, ref_out_sum, dtype=dtype) - # assert_allclose(prim_dgrad, ref_dgrad, dtype=dtype) - # assert_allclose(prim_wgrad, ref_wgrad, dtype=dtype) - # # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) - - # @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason) - # @pytest.mark.parametrize( - # "fwd_bwd_dtype", - # [(jnp.float8_e4m3fn, jnp.float8_e4m3fn), (jnp.float8_e4m3fn, jnp.float8_e5m2)], - # ) - # @pytest_parametrize_wrapper("scaling_mode", grouped_gemm_supported_scaling_modes) - # def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): - # fwd_dtype, bwd_dtype = fwd_bwd_dtype - # dtype = jnp.bfloat16 - # x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( - # dtype, - # input_shape, - # with_bias=False, - # ) - - # quantizer_set = QuantizerFactory.create_set( - # scaling_mode=scaling_mode, - # fwd_dtype=fwd_dtype, - # bwd_dtype=bwd_dtype, - # is_2x2x=True, - # n_groups=group_sizes.size, - # ) - # value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) - - # # jitting the grouped_dense - # value_n_grad_prim_func = jit( - # value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,) - # ) - - # ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func( - # x, - # kernel, - # bias, - # group_sizes, - # contracting_dims, - # ) - # prim_out_sum, (prim_dgrad, prim_wgrad, prim_dbias) = value_n_grad_prim_func( - # x, kernel, bias, group_sizes, contracting_dims, quantizer_set=quantizer_set - # ) - - # assert_allclose(prim_out_sum, ref_out_sum, dtype=fwd_dtype) - # assert_allclose(prim_dgrad, ref_dgrad, dtype=bwd_dtype) - # assert_allclose(prim_wgrad, ref_wgrad, dtype=bwd_dtype) - # # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) + @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) + def test_grouped_dense_grad_fp16(self, dtype, input_shape): + x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( + dtype, + input_shape, + with_bias=False, + ) + + value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) + # jitting the grouped_dense + value_n_grad_prim_func = jit( + value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,) + ) + + ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func( + x, kernel, bias, group_sizes, contracting_dims + ) + prim_out_sum, (prim_dgrad, prim_wgrad, prim_dbias) = value_n_grad_prim_func( + x, kernel, bias, group_sizes, contracting_dims + ) + + assert_allclose(prim_out_sum, ref_out_sum, dtype=dtype) + assert_allclose(prim_dgrad, ref_dgrad, dtype=dtype) + assert_allclose(prim_wgrad, ref_wgrad, dtype=dtype) + # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) + + @pytest.mark.skipif(not is_fp8_supported, reason=fp8_unsupported_reason) + @pytest.mark.parametrize( + "fwd_bwd_dtype", + [(jnp.float8_e4m3fn, jnp.float8_e4m3fn), (jnp.float8_e4m3fn, jnp.float8_e5m2)], + ) + @pytest_parametrize_wrapper("scaling_mode", grouped_gemm_supported_scaling_modes) + def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): + fwd_dtype, bwd_dtype = fwd_bwd_dtype + dtype = jnp.bfloat16 + x, kernel, group_sizes, contracting_dims, bias = self._generate_grouped_dense_input( + dtype, + input_shape, + with_bias=False, + ) + + quantizer_set = QuantizerFactory.create_set( + scaling_mode=scaling_mode, + fwd_dtype=fwd_dtype, + bwd_dtype=bwd_dtype, + is_2x2x=True, + n_groups=group_sizes.size, + ) + value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) + + # jitting the grouped_dense + value_n_grad_prim_func = jit( + value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,) + ) + + ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func( + x, + kernel, + bias, + group_sizes, + contracting_dims, + ) + prim_out_sum, (prim_dgrad, prim_wgrad, prim_dbias) = value_n_grad_prim_func( + x, kernel, bias, group_sizes, contracting_dims, quantizer_set=quantizer_set + ) + + assert_allclose(prim_out_sum, ref_out_sum, dtype=fwd_dtype) + assert_allclose(prim_dgrad, ref_dgrad, dtype=bwd_dtype) + assert_allclose(prim_wgrad, ref_wgrad, dtype=bwd_dtype) + # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) @pytest_parametrize_wrapper('eqn,a_shape,b_shape', [ # ('ij,jk->ik', (64, 32), (32, 128)), diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 8f21bb42a2..5eb5d9079b 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -1544,7 +1544,7 @@ def abstract( out_shape = (M, N) if is_grouped_dense_wgrad: - num_tensors = group_sizes_aval.size // 2 # packed int32 -> logical int64 shape + num_tensors = group_sizes_aval.size out_shape = (num_tensors, M, N) out_aval = jax.core.ShapedArray(shape=out_shape, dtype=out_dtype) return (out_aval, workspace_aval) @@ -2126,12 +2126,13 @@ def grouped_gemm( bias = jnp.empty((), jnp.float32) if bias is None else bias + group_sizes = group_sizes.astype(jnp.int64) # Compute group_offset as cumulative sum of group_sizes, starting with 0 - group_offset = jnp.concatenate([jnp.array([0], dtype=jnp.int32), jnp.cumsum(group_sizes, dtype=jnp.int32)[:-1]]) + group_offset = jnp.concatenate([jnp.array([0], dtype=jnp.int64), jnp.cumsum(group_sizes, dtype=jnp.int64)[:-1]]) if is_grouped_dense_wgrad: group_offset_lhs = group_offset * M # Offset is by number of elements total, not number of rows - # HACK: this is really the rhs in this case - group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows + # HACK: this _out is really the rhs in this case + group_offset_out = group_offset * 1 # Offset is by number of elements total, not number of rows else: group_offset_lhs = group_offset * K_lhs # Offset is by number of elements total, not number of rows group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows @@ -2143,14 +2144,7 @@ def grouped_gemm( # print(f"{lhs_data.shape=}, {rhs_data.shape=}, {M=}, {N=}, {K_lhs=}") - # Interlace zeros with group_sizes to upcast packed int32s to int64 - # This ensures proper alignment and prevents overflow issues - zeros = jnp.zeros_like(group_sizes, dtype=jnp.int32) - group_sizes = jnp.stack([group_sizes, zeros], axis=1).flatten() - group_offset_lhs = jnp.stack([group_offset_lhs, zeros], axis=1).flatten() - group_offset_out = jnp.stack([group_offset_out, zeros], axis=1).flatten() - - num_gemms = group_sizes.shape[0] // 2 # Due to interlaced zeros to support int64 + num_gemms = group_sizes.shape[0] # Due to interlaced zeros to support int64 alpha = jnp.ones((num_gemms,), jnp.float32) beta = jnp.zeros((num_gemms,), jnp.float32) (out,) = GroupedGemmPrimitive.outer_primitive.bind( diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py index e816132791..535e39d60e 100644 --- a/transformer_engine/jax/cpp_extensions/quantization.py +++ b/transformer_engine/jax/cpp_extensions/quantization.py @@ -1245,7 +1245,7 @@ def grouped_quantize( ) = GroupedQuantizePrimitive.outer_primitive.bind( x, scale, - group_sizes, + group_sizes.astype(jnp.int32), out_dtype=quantizer.q_dtype, scaling_mode=quantizer.scaling_mode.value, q_layout=q_layout, diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index d2b7b5e23b..bed9188f72 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -494,7 +494,6 @@ void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt64, "group_offsets must be of type int64."); - // JAX only supports int32 but cuBLAS requires int64 so we pack two int32 into one int64 size_t num_tensors = group_sizes.dimensions()[0]; NVTE_CHECK(group_sizes.dimensions().size() == 1, "group_sizes must be a 1D tensor with length equal to the number of tensors."); @@ -693,8 +692,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); - cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); - nvte_grouped_gemm( rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, diff --git a/transformer_engine/jax/quantize/quantizer.py b/transformer_engine/jax/quantize/quantizer.py index f5ca6aeaed..1923932692 100644 --- a/transformer_engine/jax/quantize/quantizer.py +++ b/transformer_engine/jax/quantize/quantizer.py @@ -68,7 +68,7 @@ def compute_scale_from_amax( sf = jnp.where(amax > 0.0, sf, scale) sf = jnp.where(jnp.isfinite(amax), sf, scale) assert sf.shape == (1,), f"Expected sf.shape == (1,), but got {sf.shape}" - return sf + return sf.astype(jnp.float32) @register_pytree_node_class From 1f6283f3840edcd60b3f9f35a2a494e1257e4bbc Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Sat, 24 Jan 2026 13:07:14 -0800 Subject: [PATCH 69/98] dgrad and wgrad working now --- tests/jax/test_custom_call_compute.py | 29 +++++++++++++++---- .../jax/csrc/extensions/gemm.cpp | 8 +++-- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 196302a29a..07db2b4d88 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1873,9 +1873,9 @@ def _tensor_to_image(self, tensor, value_range=None): def _assert_grouped_gemm_output(self, out, group_sizes, ref, dtype): assert out.dtype == ref.dtype print(f"Group sizes [{jnp.sum(group_sizes)}]: {group_sizes}") - self._tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save('output_te.png') - self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save('output_ref.png') - self._tensor_to_image(jnp.abs(out.astype(jnp.float32) - ref.astype(jnp.float32)), value_range=(jnp.min(ref), jnp.max(ref))).save('output_diff.png') + # self._tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save('output_te.png') + # self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save('output_ref.png') + # self._tensor_to_image(jnp.abs(out.astype(jnp.float32) - ref.astype(jnp.float32)), value_range=(jnp.min(ref), jnp.max(ref))).save('output_diff.png') assert_allclose(out, ref, dtype=dtype) # ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) @@ -1956,7 +1956,7 @@ def _ref_sum_grouped_dense(self, x, kernel, bias, group_sizes, contracting_dims) # Note: we use jnp.sum instead of jnp.mean to make the gradient larger # and prevent them from being clamp to zero in FP8. / sqrt(x.size) is used to # normalize the output and prevent the gradient from being too large for FP8. - out_sum_list = [jnp.sum(out) for out in out_list] + out_sum_list = jnp.sum(out_list) # [jnp.sum(out) for out in out_list] return jnp.sum(jnp.asarray(out_sum_list)) / jnp.sqrt(x.size) def _primitive_sum_grouped_dense( @@ -1975,21 +1975,40 @@ def test_grouped_dense_grad_fp16(self, dtype, input_shape): with_bias=False, ) + print("Hi") + value_n_grad_ref_func = value_and_grad(self._ref_sum_grouped_dense, (0, 1, 2)) + print("Hi") + # jitting the grouped_dense value_n_grad_prim_func = jit( value_and_grad(self._primitive_sum_grouped_dense, (0, 1, 2)), static_argnums=(4,) ) + print("Hi") + ref_out_sum, (ref_dgrad, ref_wgrad, ref_dbias) = value_n_grad_ref_func( x, kernel, bias, group_sizes, contracting_dims ) + print("Hi") + prim_out_sum, (prim_dgrad, prim_wgrad, prim_dbias) = value_n_grad_prim_func( x, kernel, bias, group_sizes, contracting_dims ) + print("Hi") + + def write_images(prim, ref): + self._tensor_to_image(prim, value_range=(jnp.min(ref), jnp.max(ref))).save('output_te.png') + self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save('output_ref.png') + self._tensor_to_image(jnp.abs(prim.astype(jnp.float32) - ref.astype(jnp.float32)), value_range=(jnp.min(ref), jnp.max(ref))).save('output_diff.png') assert_allclose(prim_out_sum, ref_out_sum, dtype=dtype) - assert_allclose(prim_dgrad, ref_dgrad, dtype=dtype) + assert_allclose(prim_dgrad, ref_dgrad, atol=0.015, rtol=0.75) + + # THE wgrad mismatch here is expected, the mismatch is always 1/n_groups because 1 of the groups is set to have 0 size, meaning the corresponding weight gradient is undefined (tho I should probably be setting it to zero manually) + + write_images( + prim_wgrad.reshape((prim_wgrad.size//prim_wgrad.shape[-1], prim_wgrad.shape[-1])), ref_wgrad.reshape((ref_wgrad.size//ref_wgrad.shape[-1], ref_wgrad.shape[-1]))) assert_allclose(prim_wgrad, ref_wgrad, dtype=dtype) # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index bed9188f72..835eae0442 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -666,7 +666,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type convert_ffi_datatype_to_te_dtype(beta.element_type())); - // printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0] / 2, lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); + printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0], lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); if (is_grouped_dense_wgrad) { //// RHS @@ -677,7 +677,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type //// LHS NVTEShape lhsShape{.data={m, k}, .ndim=2}; - lhs_is_trans = false; + lhs_is_trans = true; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedLastDims); @@ -692,6 +692,10 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); + // Output needs to be zeroed in case any group sizes have size zero, meaning the expert weight isn't used in the fwd, meaning the corresponding output gradient should be zero. But using the grouped GEMM, the output buffer contains uninitialized data. + // TODO(jberchtold): make this memset smaller by only zeroing the expert weights that correspond to groups with size zero. + cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); + nvte_grouped_gemm( rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, From 2eab97d50c07e6cd93c617b856fc5a12cfffe483 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 27 Jan 2026 14:29:57 -0800 Subject: [PATCH 70/98] backup --- tests/jax/test_custom_call_compute.py | 82 ++++++++++++------- transformer_engine/jax/cpp_extensions/gemm.py | 35 +++++++- .../jax/csrc/extensions/gemm.cpp | 18 +++- transformer_engine/jax/dense.py | 3 + transformer_engine/jax/flax/module.py | 2 +- 5 files changed, 105 insertions(+), 35 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 07db2b4d88..737b7e2e06 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1782,6 +1782,11 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): @pytest_parametrize_wrapper("input_shape", GROUPED_DENSE_INPUT_SHAPES) class TestGroupedDense: def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): + lhs_cdims, rhs_cdims = contracting_dims + if lhs_cdims == (0,): + lhs = jnp.transpose(lhs, (1, 0)) + if rhs_cdims == (2,): + rhs = jnp.transpose(rhs, (0, 2, 1)) return jax.lax.ragged_dot(lhs, rhs, group_sizes) lhs_contract_dim, _ = contracting_dims assert len(lhs_contract_dim) == 1 and lhs.ndim == 2 and rhs.ndim == 3 @@ -1803,39 +1808,52 @@ def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): return ref_out def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", with_bias=False): - key = jax.random.PRNGKey(0) - subkeys = jax.random.split(key, 4) - n_groups, m, n, k = input_shape + # key = jax.random.PRNGKey(0) + # subkeys = jax.random.split(key, 4) + # n_groups, m, n, k = input_shape + + # m //= 32 + # group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) + # group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) + # group_sizes = jnp.diff(group_sizes) + + # # Make one empty input lhs to test empty GEMM handling + # group_sizes = group_sizes.at[0].set(group_sizes[0] + group_sizes[1]) + # group_sizes = group_sizes.at[1].set(0) - m //= 32 - group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) - group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) - group_sizes = jnp.diff(group_sizes) + # # *32 to make sure that input shape works for MXFP8 + # group_sizes = group_sizes * 32 + # m = m * 32 - # Make one empty input lhs to test empty GEMM handling - group_sizes = group_sizes.at[0].set(group_sizes[0] + group_sizes[1]) - group_sizes = group_sizes.at[1].set(0) + # # group_sizes = jnp.full((n_groups,), m // n_groups) + # assert group_sizes.sum() == m - # *32 to make sure that input shape works for MXFP8 - group_sizes = group_sizes * 32 - m = m * 32 + # lhs_shape = (m if data_layout[0] == "N" else k, k if data_layout[0] == "N" else m) + # rhs_shape = (n_groups, k if data_layout[1] == "N" else n, n if data_layout[1] == "N" else k) + # bias_shape = (n_groups, n) - # group_sizes = jnp.full((n_groups,), m // n_groups) - assert group_sizes.sum() == m + # lhs = jax.random.uniform(subkeys[1], lhs_shape, dtype=dtype) / jnp.sqrt(k) + # rhs = jax.random.uniform(subkeys[2], rhs_shape, dtype=dtype) / jnp.sqrt(k) + # # rhs = jnp.concatenate([i/n_groups*jnp.identity(k, dtype=dtype).reshape(1, k, k) for i in range(n_groups)], axis=0) + # bias = jax.random.uniform(subkeys[3], bias_shape, dtype=dtype) if with_bias else None - lhs_shape = (m if data_layout[0] == "N" else k, k if data_layout[0] == "N" else m) - rhs_shape = (n_groups, k if data_layout[1] == "N" else n, n if data_layout[1] == "N" else k) - bias_shape = (n_groups, n) + def load_tensor(name: str): + import numpy as np + tensor = np.load(f'/mnt/jberchtold/polyphe-lustre-home/maxtext/gemm_{name}.npy') + return jnp.array(tensor) - lhs = jax.random.uniform(subkeys[1], lhs_shape, dtype=dtype) / jnp.sqrt(k) - rhs = jax.random.uniform(subkeys[2], rhs_shape, dtype=dtype) / jnp.sqrt(k) - # rhs = jnp.concatenate([i/n_groups*jnp.identity(k, dtype=dtype).reshape(1, k, k) for i in range(n_groups)], axis=0) - bias = jax.random.uniform(subkeys[3], bias_shape, dtype=dtype) if with_bias else None + lhs = load_tensor('lhs').astype(dtype) + rhs = load_tensor('rhs').astype(dtype) + bias = None + group_sizes = load_tensor('group_sizes').astype(jnp.int32) lhs_contracting_dim = (1,) if data_layout[0] == "N" else (0,) rhs_contracting_dim = (1,) if data_layout[1] == "N" else (2,) contracting_dims = (lhs_contracting_dim, rhs_contracting_dim) + print(f'{lhs.shape=}, {rhs.shape=}, {group_sizes=}, {contracting_dims=}') + # import pdb; pdb.set_trace() + return lhs, rhs, group_sizes, contracting_dims, bias def _tensor_to_image(self, tensor, value_range=None): @@ -1873,10 +1891,14 @@ def _tensor_to_image(self, tensor, value_range=None): def _assert_grouped_gemm_output(self, out, group_sizes, ref, dtype): assert out.dtype == ref.dtype print(f"Group sizes [{jnp.sum(group_sizes)}]: {group_sizes}") - # self._tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save('output_te.png') - # self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save('output_ref.png') - # self._tensor_to_image(jnp.abs(out.astype(jnp.float32) - ref.astype(jnp.float32)), value_range=(jnp.min(ref), jnp.max(ref))).save('output_diff.png') - assert_allclose(out, ref, dtype=dtype) + self._tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save('output_te.png') + self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save('output_ref.png') + self._tensor_to_image( + jnp.abs(out.astype(jnp.float32) - ref.astype(jnp.float32)), + value_range=(jnp.min(ref), jnp.max(ref)) + # value_range=(0, 0.5) + ).save('output_diff.png') + assert_allclose(out, ref, dtype=jnp.float32) # ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) # print([o.shape for o in out_list]) @@ -1890,7 +1912,7 @@ def _assert_grouped_gemm_output(self, out, group_sizes, ref, dtype): # ) @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) - @pytest_parametrize_wrapper("layout", ["NN"]) + @pytest_parametrize_wrapper("layout", ["NT"]) def test_grouped_gemm_fp16(self, dtype, input_shape, layout): lhs, rhs, group_sizes, contracting_dims, _ = self._generate_grouped_dense_input( dtype, input_shape, layout @@ -2005,10 +2027,8 @@ def write_images(prim, ref): assert_allclose(prim_out_sum, ref_out_sum, dtype=dtype) assert_allclose(prim_dgrad, ref_dgrad, atol=0.015, rtol=0.75) - # THE wgrad mismatch here is expected, the mismatch is always 1/n_groups because 1 of the groups is set to have 0 size, meaning the corresponding weight gradient is undefined (tho I should probably be setting it to zero manually) - - write_images( - prim_wgrad.reshape((prim_wgrad.size//prim_wgrad.shape[-1], prim_wgrad.shape[-1])), ref_wgrad.reshape((ref_wgrad.size//ref_wgrad.shape[-1], ref_wgrad.shape[-1]))) + # write_images( + # prim_wgrad.reshape((prim_wgrad.size//prim_wgrad.shape[-1], prim_wgrad.shape[-1])), ref_wgrad.reshape((ref_wgrad.size//ref_wgrad.shape[-1], ref_wgrad.shape[-1]))) assert_allclose(prim_wgrad, ref_wgrad, dtype=dtype) # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 5eb5d9079b..3c5807a921 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -2169,5 +2169,38 @@ def grouped_gemm( is_grouped_dense_wgrad=is_grouped_dense_wgrad, use_async_d2h_group_sizes=use_async_d2h_group_sizes, ) - # print(f"GroupedGemm: {lhs_data.shape=}, {rhs_data.shape=}, {out.shape=}, {M=}, {N=}, {K_lhs=}, {lhs_is_trans=}, {rhs_is_trans=}, {contracting_dims=}") + if not is_grouped_dense_wgrad: + def my_callback(lhs, rhs, group_sizes, out): + if contracting_dims != ((1,), (2,)): + return + import numpy as np + lhs = np.array(lhs.astype(jnp.float32)) + rhs = np.array(rhs.astype(jnp.float32)) + group_sizes = np.array(group_sizes, dtype=group_sizes.dtype) + out = np.array(out.astype(jnp.float32)) + + lhs_is_nan = np.isnan(lhs).any() + rhs_is_nan = np.isnan(rhs).any() + out_is_nan = np.isnan(out).any() + inputs_are_nan = lhs_is_nan or rhs_is_nan + if inputs_are_nan or not out_is_nan: + return + print("GroupedGemm NAN detected! cdims:", contracting_dims) + np.save('gemm_lhs.npy', lhs) + np.save('gemm_rhs.npy', rhs) + np.save('gemm_group_sizes.npy', group_sizes) + return + + # jax.debug.callback(my_callback, + # lhs, rhs, group_sizes, out, + # ordered=True, partitioned=True) + + # jax.debug.print("group_sizes: {}, lhs=[amax={}, mean={}, stddev={}], rhs=[amax={}, mean={}, stddev={}], out=[amax={}, mean={}, stddev={}]", + # group_sizes, + # jnp.max(jnp.abs(lhs_data)), jnp.mean(lhs_data), jnp.std(lhs_data), + # jnp.max(jnp.abs(rhs_data)), jnp.mean(rhs_data), jnp.std(rhs_data), + # jnp.max(jnp.abs(out)), jnp.mean(out), jnp.std(out), + # ) + # jax.debug.print("group_sizes: {}, out_shape: {}", group_sizes, out.shape) + # print(f"GroupedGemm: {group_sizes.shape=}, {lhs_data.shape=}, {rhs_data.shape=}, {out.shape=}, {M=}, {N=}, {K_lhs=}, {lhs_is_trans=}, {rhs_is_trans=}, {contracting_dims=}") return out diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 835eae0442..15d39e09a3 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -666,7 +666,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type convert_ffi_datatype_to_te_dtype(beta.element_type())); - printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0], lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); + // printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0], lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); if (is_grouped_dense_wgrad) { //// RHS @@ -715,11 +715,23 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type //// RHS NVTEShape rhsShape{.data={num_gemms*k, n}, .ndim=2}; // rhs_is_trans = true; + // if (rhs_is_trans) { + // std::swap(rhsShape.data[0], rhsShape.data[1]); + // } + NVTE_CHECK(!rhs_is_trans, "GroupedGemmFFI currently only supports rhs_is_trans=false"); auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); //// LHS NVTEShape lhsShape{.data={m, k}, .ndim=2}; - lhs_is_trans = true; + // NVTE_CHECK(lhs_is_trans, "GroupedGemmFFI currently only supports lhs_is_trans=true"); + // lhs_is_trans = true; + // if (!lhs_is_trans) { + // std::swap(lhsShape.data[0], lhsShape.data[1]); + // } + if (!lhs_is_trans) { + cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); + return ffi_with_cuda_error_check(); + } auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedFirstDims); @@ -735,6 +747,8 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); + cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); + nvte_grouped_gemm( rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index b5fd393230..ed1e5dfc38 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -664,6 +664,9 @@ def _grouped_dense_bwd_rule( dbias = tex.grouped_dbias(grad, group_sizes) if use_bias else None dkernel_amax = None + # HACK + dgrad = jnp.zeros_like(dgrad) + wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index ee84f74c48..82be17006c 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1534,7 +1534,7 @@ def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwa kernel, group_sizes=group_sizes, contracting_dims=((1,), (1,)), - quantizer_set=quantizer_set + # quantizer_set=quantizer_set ) return out.reshape(target_out_shape) From 11ea298b15d4e54f575c5711c8da29172c61d643 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Mon, 2 Feb 2026 10:23:12 -0800 Subject: [PATCH 71/98] wip --- .../jax/encoder/test_single_gpu_encoder.py | 4 +- examples/jax/mnist/test_single_gpu_mnist.py | 4 +- output_diff.png | Bin 0 -> 23486 bytes output_ref.png | Bin 0 -> 4148 bytes output_te.png | Bin 0 -> 464907 bytes .../jax/csrc/extensions/gemm.cpp | 42 +++++++++++++----- transformer_engine/jax/dense.py | 2 +- 7 files changed, 36 insertions(+), 16 deletions(-) create mode 100644 output_diff.png create mode 100644 output_ref.png create mode 100644 output_te.png diff --git a/examples/jax/encoder/test_single_gpu_encoder.py b/examples/jax/encoder/test_single_gpu_encoder.py index 82c7fed38e..6d67296bd2 100644 --- a/examples/jax/encoder/test_single_gpu_encoder.py +++ b/examples/jax/encoder/test_single_gpu_encoder.py @@ -195,11 +195,11 @@ def get_datasets(max_seq_len): vocab = {} word_id = 0 - train_ds = load_dataset("glue", "cola", split="train") + train_ds = load_dataset("nyu-mll/glue", "cola", split="train") train_ds.set_format(type="np") train_ds, vocab, word_id = data_preprocess(train_ds, vocab, word_id, max_seq_len) - test_ds = load_dataset("glue", "cola", split="validation") + test_ds = load_dataset("nyu-mll/glue", "cola", split="validation") test_ds.set_format(type="np") test_ds, vocab, word_id = data_preprocess(test_ds, vocab, word_id, max_seq_len) return train_ds, test_ds, word_id diff --git a/examples/jax/mnist/test_single_gpu_mnist.py b/examples/jax/mnist/test_single_gpu_mnist.py index 0c76d51c37..ef85f4a7ab 100644 --- a/examples/jax/mnist/test_single_gpu_mnist.py +++ b/examples/jax/mnist/test_single_gpu_mnist.py @@ -146,7 +146,7 @@ def eval_model(state, test_ds, batch_size, var_collect): def get_datasets(): """Load MNIST train and test datasets into memory.""" - train_ds = load_dataset("mnist", split="train", trust_remote_code=True) + train_ds = load_dataset("ylecun/mnist", split="train", trust_remote_code=True) train_ds.set_format(type="np") batch_size = train_ds["image"].shape[0] shape = (batch_size, IMAGE_H, IMAGE_W, IMAGE_C) @@ -154,7 +154,7 @@ def get_datasets(): "image": train_ds["image"].astype(np.float32).reshape(shape) / 255.0, "label": train_ds["label"], } - test_ds = load_dataset("mnist", split="test", trust_remote_code=True) + test_ds = load_dataset("ylecun/mnist", split="test", trust_remote_code=True) test_ds.set_format(type="np") batch_size = test_ds["image"].shape[0] shape = (batch_size, IMAGE_H, IMAGE_W, IMAGE_C) diff --git a/output_diff.png b/output_diff.png new file mode 100644 index 0000000000000000000000000000000000000000..65d7f4547658e2a727485f8474d7d76d672c3b0f GIT binary patch literal 23486 zcmeIac|4Ts8$bS_Lr&zV(?X$g)ahiYl(j*til|g%FChkF9c9dnR-Fp1R2tGoSq5b{ zMk&%*D$6j8Q9>GHH-j-#?)F;8&=!yXrgyIb7PeYx|Ki4>3)ANUG9x z*Uk6i7QpZO-+%u_;J*m`7lHpG@LvS}i@<*o_%8zgMc}^({QoHe{?dtpe%#c&-lPNI zS0BuCcA_9l5VM_HAjQ3D|4%=)mA|g3GCzaJPu#%z=F}bYZnLkyul+F`bz_{xgAS^E z3ZC~=uljpms*|yBnH*%wv|PD;g=4%led+$7lF0p_J{8oFBDdIhGN3f z_z;&)Wrl-HKiBG|X;PD#ja~q!Y(m+DzXZP?wHS&y98n z7W`qNEy1Xw$2Nj+d}c_qI_~rRD(#LV$7Z*)=;*cldF_D#mpLKVyD$5;jc!{e@dS;m z{4*zLG7s$4pQFQae1S2L8e8A^>e9m#wSVL3M78jbZT`{&?Sm|M6=+sXeOqhnRyZ{N zzFM}41?ku$~|9 zoUC=F@*g9omX@UTQt*_WC%+sjRIT`eWWnFZ&K~~c2jcrWg?qFxqrbgzQTw~wPc#Ya zu~jLTf(2U=YLFb@?Tn3WT;3Nn{4(TXxHWyT@2=|Ir6_~4ghL-PFa*T`UOM&bd~gmw zQz?lrvI>zE$p=&^nRVw+_$=Yb#UK5k>l3`mS=#3U+Qx-5d|z3=b?SO~405vEM_BqX z2kK-HORm%ox2KR`R_h7*uH9q{%ExY}_&C_9D!t03`3ll)=X86OIIW}0>ColokueOC zU4Q%btF=F9RyZ6kdcBc~kMzJ=&nDe3qx!oOB#`Jwi4K7iSa3Z|Ct#ksICZlw!>dgh zwPF|T9)FjZ8nbZnN2J5KI4QIEO3ieT?WxBFEb~)fbIOkV@51;!ZYW~dZhe*vckrBs zZx@;qcoqd&>b{|Rvuf)%+mEs;syF4?AggX%eD*Bk8C!l>4J@F?KAj!+aC~cSb(NMv zixfx)upp47npLPxS}z0}6WL*wU++8rcCY)NLFQ%DzwLA#KhN?nqo$ql9yv&|?E*jj z&K+bWOpbToXE^qy!&T8Zp$oZjlzPGzk>Q|??gh)lwbtgTC)dFzytVODmvn^J+OfS_ z3Dkapl-wue+)wA8@%6w5u|{t)?hGoMSs}YKE=<_aVbONWREq(x-$T{LfOlIn-KqF@ zu+&~9QYPE{z9^KO6fTO01Xb%CMOp*_Syhj%uTEJY8*kI&Eq=-ilO3q*jmlp4aAgv^ zf)^$|a5N$-mOp(Krc?Y;OJ8kXTGyKP1h{sN&|?A>{Tm0&HIdrVO8=C2iGi#(a-<3x zLIvp7+tKy}Q+B3llzKVvI!w7jS!^&P!+fA=(Ib%IG4&<)*Jrc~ID4r_Du_i+=mkM> zG>xV^V<|%WQxt;^s&S2X7jaL&e_LEjc>FTef;dV@BjLdd)K-9PWHif}3CgsaD4H5fG=<3O6<;+|Ee+oLWOET8<7nO%(E4 z;Iljij*X^4i`)s0y2t%%IvQg6jC`Efq=72)o4O+0xsN_7w@f1M=F5;_1>Z*wFEhxi z`yu~{M$)YegwW5yD*jOSbzMyqWKDPSXt5a1)8?c0keJgBfIQdoU&`r8wui5={ z+=7=9?Y@yFu95g+-shEt>{=Dh{Rkv`7Ibc`AR?zPZO2~h%TtAr;?{HP8g9<-9H+AM zZl8Gy$gX;vi#>G!jepEZ)hLbf`ugu$^U2+G1$KwrBfRD|Lt#s2PR~t-4vnx@wE;Q( z3&+u6Z7-NMd;_@53K$14sNy~huh;*zi)SjE!Wii|Z;5>FnL1M4Fo^wVgGuk(REW1x zgP_Gq-ZNr!ioVQwo}8dkD^FBN^S`M~iPoG%lpf$0(bUka-ZLMvVpjy?Wdu4>CGiE# z1=ZhGUx2{LWPh$m)clOp($cXl(ChnTZ5S0CM@x6wBRd52T1TCnc-gha)$#c|&({HT zUdiHtz0-dOa~HPk{v5Q)@YSq?kV*vgsZBObdE`{71%Fx=b5fJNl`LUbmT)s8c_>)= zlShdVE@`)i*5tLh_sJ%Q5m4lh$&e+t4O(2$-_Y7Fhuf1HBB)g=t-)Xyp1cdxNXCH$ z-u=tE?AkNjp5Vu7`EZ7L!1#4dsnnt-(-TxpBz}qOV15g-uSO9lY{^ZOc!*6y<(L=s zsV}e%#6kY0U>S+;hBeIM_gP**dj=0~{mT~yU1jv?7`o%U)(VPeo!Zn_xp>>s zvkhg-Bv>xmo?`|Sa1hMn{EE9P(TD(GK;|Tgc`MP!Ng1KnhBmt*CuX;k{Q?jMWng zeS?kZ8h!l(YKf@U(fLwl&i^Vjg_rPofa(($HjMrass=ti`m=sd@8|DG9;PXrJ|sjJ z&_!2KQU(B2b`+K*W$B7w%(o|xM4iHlPVaim?KClCx;g*}EX$%8s z!As0(vVgRg*?b!zjCHH5@U0&Wa+)`3(?a0I)Fb$tK^Gd@6cuo`*zyBT_&vv7QYv-p z_&GIBl^339x^lLmGq7?KR2;5+y^(Y?gM9Lugc-YLFWu%|k@R$iAZ2h@P$G8yg zPTvdBQwTQwwrEW45LpCM%U*=)w53k5O41Pw!P;Z+?<{b^-9g4kxnAOREplzf6;lPZ zPAtR8?^l+9sz1?|*?MApYsAjH=ncVw_O|;gRrZSyw1~>>^I#MDgaLLL2=XLVo9O>_ z0nP>&<91p^s1<**InX-eJV1O!i5^XvsPW?O@vF+iByb&RdR5F;jZY*(1_?5gN zlnHHYb;&z#>{|QxaeD3eKc&Tb&$)Dm?a%MGssn%!9lyI6P_fj!gvXDylund=A;&U=+dLr*E^y4kE+8%GQb)*K0KvQM)pKbYWqOv1Bs_A(6;$!zlVPb5%{`9hM}7ZrApt} zSFEYvOh{A=cUwe=X)^0F7Q^Ac3c*)&5XXE2Ic3B^Es2Tn$D18rLB0-8D0}#K`lyW@wDTfbWraaF zpu}@0QAnMN%X&dKS)%>(x@+6gkU|qDP$Z8IeZp^WLlWlXRsXL)6p@FX)Uja>s%`+S znhD=z|0uzMy5zO^NiA~0{N=Q-I!5WMrlxq&PWsTcu1@k)9M#Y@rr)e=B*5jjdvs7l z%ZO7gKyikq=Tcl$o5zw5nZ7SXa6b3R58Vi`*p-;~T(|Wwk2PoC8x2qY+!Y zi^=zHyC%_ojn>(k-cm-%a}6i*LP0|Vx?Aj2GhgOjLznmGILI?lpt#}ryEl%+?EX^T zUY{=^Ql-i(Gb@JK%kmbjFCygHN_2^I$e()Dec2%mCHG#S^yM>r*K{MApny?j`k}ei zGRnBMcwaWM$MwFi=Iu2!{I$GCr0B#EOKT@JK0|~OK#KClHV(5evTv7@4=-88 zQG1<%Ym2gRH4MaVXn?FWAVu{!6T(Ej2g64r&LW|0sg6%PlAZlk&3pVX685&&{6Q*P zZWTkRQKrjxwLRTB!xQ?@P{=Z2O&F`%WMtI!Fg1RB|8c?-B}6DA_pY^V9I&*I>jRvupMf2 zBz6>su6@bszWOUI3mu&)D8Is3+}Dzhsn@y@)wP*@x(iE-f8N!Uia_YuFWq(o*)%tf zua==kOXwfti+Y|jod}9@Vfc5b<}AzCC)#tIgaUe^(dF|931wa8j_4@HLZ*{J{fVm< z*nHEq&tWV2{Y3R4b!m=vZfk~N(XXCBdO^wI=qgG_ENkoMiHMn-*%lM+NK?Q~LDr)5 ziA`8>(1miD@@A3i`@DrDkNppSa%|DY2k3d3Z%otHu^v~4kKY94z4X{mMDVD4Cv~kd z4&%EV69s-#m&y`zksGu+8u#@@ZKOmy&*E_->c^ST?*f)?4%ZA;p7OqyT5o9)o^JBE zJ==@?p!}Xlm*@2NP`caE?Fvnwk5LO>m!;?W`fw8KEclR%fBG z?~0pN?ZfYKE0`z?DBW1y$@FRRI=x)7K zU(rydS}GI%_-Rg-M{R@B1O=aU;Pzg~XpNs|(=vGCx;`QgeJ_$!kG$e8S~d9nN>NXk zvJTvy4~xVBl5xf*DAhwv+RK(n1O;A3Jvmya%Cjm}M^J|m-DV^qc-?HBbrs1RC+%fY zrNz0AxE*g>I|Easu&-;|#u&Eofp7K$3s!2c}n7Al)0#1gj2< z7ig5KVJKhtmJ#a*8x#NCaWNyYSzT05Y9Cvb$XaQj^ZDFa9F&aIxMw!AOqcQHfvp5L zK;W1^=`#@)&?y(+;5Ymay3IVN$h)qgb);}aV9D8B-MYs#uH|T}=k;wV$g{Qr;kW+o zc!{S@Wrb$r5wD$G943iJkZ|MpMe=`Ak4lMZXJdISbUnxO&SSQz(CrE4&IeE8)6*&| zK8-FmVr`Ac!&3)sqHTvbL_qURbwljkbm<6|iQ4EZ=!DaX_1JB~+orO9C~{k-*(oYV zAfFS0AfC~O&K)(?kpC(g>#V%myaGxZe=zPsn{Cdpo4$beqnpO#%enBp(h?t-a3PeSB9;-UQ>t>u8k%_ z>p-t^xrlSI#I-7$PA^B5;Q2J6p}U(jw9ey$m8ZuCKiw>I?^B=l)XPD*(=qiP2_-1F z%O+JsX;dGC59tu@6x1{nmnUjXnC)-MZ3W#?WaP-wEA->kx~TqDXvkODL0!s;TBd3d zK__SC-mg~|xoN;9%iuEKwpylJf2la!sxJl17SKj#O}>fxC#4 zpZ~HCJ@tZEjl_!H0wh7xlAKa^J4^Z(Japeb$)p0Ibh0@O$l1>|0i{EZjEuydX)iXYg-W71iu`D)| zy=jD!#kCV1=K^xf)H59^)aTd+KrG#*o_0KA=dDlK)KHxs8E2_8+-<|G1iaUqAd^L+ zmU@+pk+3C&kyuf^BCj5ICFh9KPTcWCayjVXKPFA_I z9S`@CMdO$rr$mwaV%=2++?Q~7WyhlT7FZsja~o?;+m&5(7-LVhKtPKFlEZot7&7*S^9fSm3w7YQqN9nxAm@4>JSu8)l+_V6qY={C$30N*C(+jsmdGXd}u7Mfu?c?rw` zZ)ofVQB84@9qo(c7PdV32w9X5D`>Ch z35mjBRUO>%3Sc&lJ>0^V8EYqI5JaX!S`vP8=W`7+RX|M?eEgzB8gHu5@mO!6gsbnu zCf)sYqfd(E46G1_7dOzyZUZoYb=S1~QqGf!_?&nYo`lWFmAXO{R7JV)0(B4+)O#vDSh?Km) zUf)k%oU@^&WQ%-8K$(+TIP72Ew;fMzZq2rj3#x{|9O3kE26-T&siD^pnw)r_ei5j* zVEj(~ScqivwDBnaJ2Q7z|5k|ZTTWO!4m zO*gea*Fq#+)Jyl3f-k2-=_MxbfzG4499@d4>Sxl)XkYN;bj=h#;huOu$okWXCFnSf zu^;;SC&w{VQnfNA^<5sOVomf{viw8L(n(I$L(c^;h8zi+BzMMGv}R1bmur8pg4>_V=xZ6Ky-^ya{19r;>}0d*j*@7nbrXGcZ<@mvtxS-b z+glN7E-n9rcqdE)2`vbZ98Nk<_Q8}9WWh}nu zR1>uEJjjlPpGT#vAFxC=o9Z=H{>D<8wb@6J7%dtzfZh@IhavmtzHT_Mv4tv;DI%PK z0BAY?&WH%rTI_2b{Xhl!z`j+s@BP6O&}m(lvv8LhU{Nde(!?aFpgNob*SlVS;eC>FVTpbQl$M~8r^7bIxPr}(=Hx@#}H128G>>_ohN#6}av@b9I3Cuo0 z>9yLHrQ(r^agw~MpcQ;t1|vo{=k7}Je4laXwI{J9SZMsJ+*>zG#=AfHeydYXk#wrO zT43SSqE)@XIF9NsBrXW;QG-@y!Wca2CHu)_vb)*gJ)>&r3~~Vg6XaM+m5zKht@XQ; zpvpg+8%meivirRxG9kC-7Np^kSCOSJ%<>M<<*w`?u>dq72(s2&GE z^i#n?ySN{`+hO;dyN^a>C}g(;%=e~yo7U7Xg>Nlr>GzH)Y|(MHU}sbddTxUO(l>tNF>HTRafriA6c;9Vn|+(e^=ocJ)4tNQ1PR3G~n zI8G!y6_vnk;Qk@PIq@w`$qaXQY;eHM_u@K9l*rFtm-j^_mh+i0hK7xd5!Dzkrmjz4 z%sS)875%P`?#>|>uSK-XryJ?{kAWeR(mCGq z`-Py9%%BM8m%wvKZmyf-Sc@}q>@#C+AR3f&B>VFzNOhGFe~yr*iVy<|`oreBtCAfSAKr!4H!5WiQP;BTfl}$K+LHZ-jcu8o$=kmNzsu^| zBncEOK}1hcON#3X9TQr205vnY*EcMcOTJXFH(L`^^g1hN3msZm3exgCVoI|>pISUe z54?%qA~oNnQMJC`IB{pNU^Na)c6vbx51Fv68M*yV8(K25T@^oK)J=z{*MDjpFTWMH*O$xTA|miC%5jc zvJRf|mFKM>8uBFwJGbyiZ0{8Hz7RLPM*TEVLVT2tTx1{fAk%`n5 zqTqZlp_2dU%@cz!i(D(6d`cI;iG!TfiocH$U!o4Gr#3n)Faeg=!bfI6_>P!y{0T)W z&E}wl!q9JIB1-LDIQ07r-MVUfhkG5*Tq03lpCi!3j1HD>_fuqUZV^{aet0Pw=V zWuX@WBbyFIs?Iee&3v=;T|V(`B07lea;rw{X#0av1>Hc%-?p=lk8LP%x#tGoUx4YK z6Q(t_HKJcg;aO+GmwXtR>~Wb0J3uRU^4S48l73IbfXH;>51w|a)sx!0tuavF$)%bh zZVbe|2x=Ku?~PXRN&qW}FE%X;>r*@MPZZwD>C^qr%d@E4k~zGqb~*C zMBRzHMkx2lDOF%#PUuLf%yuv9B};Kvm`5d|-Y#18 zc8hlP9i5!{Ok=y`?pP?tjX%G6=GlOHw2Q%8xmD5gJxLcxN$0PAqO2xG3;aqSXgb_- zhu=)&IQJ!6%8b)Qv!DG~MRqJi+jXz}ek*464xV1`h_gk+>}R&;Nk=fU$aD6x^teTY zd~$#->yT$4_0s&sTz`_PDYR>>;SztV+Tn|RJtG%loupD!aGI6b0rjVlkBhpE8UlMb zUZLc@zyaIe)8W>~#H>P*CF?I0V2rX+d`>kSC+}>Z=8d?{v06rtdM)(r%0}wmu*>_` z?%)*~sql}rqruORLpTaV}dMJqpa+ub6&cZanM*dlXISZ;zXqW4%;ClEA~ zD)Czs*Y*~LgcV7wdP}a6$hkD1S=V#P*#&ITfp>~Vdy7$NP7o~oD0)Ail_C6Ir8r2j z#^_bE+EiA!jAo`iPZ-y!wpA@$d-F+XKF(~D9UP^DbZ6-_75?9|qVxdO2uCTICpXF& zYW|D(?DYMUSO*d#)3fEgT}d0H$d%7rx~7ot=SU_*k!<8IUu*>6PL;F zNj2y4pM$~9#S5p3q<+#H=uZQ{(_WD^Wcc? zStPIHaeVHyiK?&jcpJ+V992>t>1_`&y^JEn7RrUb0o^ z?UtJFgawL)d54qLwIq-dVpdq|<}kAG7r~x6pao4)X-T__#Idy57UNER_ruv*FWSfm zExU2u%;V_fyyKGMdPY9J45bEAA{Gt=uE3GdbB+8=oR}ufue2oASaw+UpH4^+- z^hoqqqK~g^g}~A2{5rU(ho~9^r5ZzPqJ%mWbe@AT;|`kRlg(Br3)Eh_FK zDHn84_sD6*LY`}~I`jAN12gfsrD@#J2;QBWb8M!@eO8XPrD>SO%&#-J&DYxXqirpz zEmZa(T>d^Qk)v(%ZHAv|x;-nIob=JM1KTfpJ**VK$8#{5Za-)H8`w&axB09O33=5! z>P0{ak&S%1(_adna!TgM&4wGF`C!b<$lARxmJ(jK*x5$B?rqIg77_F7@`G(k?ttw- zA`uF~_FHy;5ThcTnr*+IN)T=RlkSm?iiP^G>o&3E?Pl*{Bn%v-16g#GBIH%MeIRSz zSSO|k&z{+Siq#Z%{M(O4snW+&?|ClTwcK7_v?lNBClslqhp`o`FDAj~_@aqy^_EvG zySAU0I^u(Zo`9oxXwsApL{AWbvQVfY(B) zW9x{jL(JMo!<9jLADdI>f^1KFxXw8yoyGw1EWY%Z&t;Qn=Y;?qT4pixk7hSO{gxfK zEXQwM19%=1MgX|Q{bP2O8@ZU2i`aW7Dd{a>ZwXR#;Qs*m5)tg6e{k|?p;b&sq9iCtVb*fuM%TjK;?&jitf;c#+i{7PKnNS7(v7W2v$`)!SLq1e;Omo54m~7VR~Fc%1of ztDJt?zC!Ww8y!?X2y6HAtCP5%w)T>fV=+guD|XIouG7jqUm7q=cLcodo)KLX9s0R{ zHXG$+D|#R-Xs{C|?;XKm%(+`@2wx7iwscNv<6n6mhg18~y&)Cni+vhDM<}z{S>rg? z6JchjS;c+X2i(EC=;io$y$evnq4`!+XUY2LboEmfk9+AVd`A~Cy0Kk8nu4hc0O?VC|YN0eKG*_5}^-9{B^%+7^D9 zX&i?49E>bqy9_f4f|I3I(V9DWa-_Iqpqiu85#f@mXF4rhqN~dvAVnn7Vl7#94wMy{ zvq)E^w-ZDoms2T@a?i&S7cZ((11h`Q0p6P^if=hsJogkWy16GBXj>6^gj9p?dzb_9xR&Q|x=;+U5!@keuf=}u2v8PTSJKzF)?mc8*j{p~d=A@qF zoD0je>oYRVkdLph(z7+4><05w7z7U|CVc@RH#Slkoo?X7UWm8&ultm?kW zk9r|pB^=oU_6(~5#_gPW`L@$_ipjC60NS&gi0>eUZmRK^=|PG_K!$dw4O2_TYl?;% zm+Aj%#KOTh(Q;;#K@F+x^w`e;Z z6)``vh$p0_8&SHUOg_0lmqu9=Y@O_>RRFgebQt_L+{3RakMF+-x~olVdqpYWWDMRX zGjT@DrC5=N|M^w;w=`S&<4VSc-~`C=iKP(Si#(~*ZI;t|K5>-8`+=mwL7S2?APX*% zoQm55)29g$@7P= zy>BkIh+q;!NXPq@B;g`CBe%Qbf_;|ER!3pRw?sWq&57EBNkC4tTbYOJyE@$%H}KP{ z1lo0$J@K1Jf8}JmZ|`~)4OpeWas~vBfOOCjfxwwT2jSO58sHO^SQU?!Ul?82^II2L z0tA0#HmB!RCxpoUG1KR^_8bBjEYjCV{jI>Efi~f<|JBeix3&0r#^lYj+svHkN$ms% zLEG(4XMiRpb$M~15HX7I3R9<7D08Z1Kc9pENuuyfF<=qkAN157+`u2bricTTW^7FI z0(Y;(?YPyKo@X!%;Ag9$M6zv@C#Tv2&_F=#{qKBY`YV^lKTi;{yp!GJV25fUtlAT- zOb2W3^hb-OaQPv6WyV&iu|YLTT$(3*oVLP8{G_PJGRF;k4J10s?mOmy&Ucaq_V zJ%Cec+yJDg?~JlE5e#;9)&y?I+^!d0=&4t}yHpEs%5V3ENM?X9dqxCvwS;XgY`VCM zwFLcH<4wTwH^7<*-1$gWZLuk-2f*n{N&sRrngb-oFBwM0f1NKbA?)Sf|JlL29BlMW z=0f=4S)GD&2p*gW?)Zk8mKbyxc8!Z%@6X`DWl|e@4j!2NKpMw}X}yqb&yCFD3ko{o zF^~Oj=D^BNw}1s}MfS+pERqE6Z(09NZnh8niL*>Em765PV6uD5s6h3stUtbRbnwFV z*i$$z-!2xn`-k%vE5Iv1-?s8#`-tm7YakaJ2}wx_VhIxlYYA{5`)Uswh7X$(g+R#_ z8j*s2Wx+y5PvR7zAlwm-(3(>;O7jXG6&8iHoQ34N1r*ra$nC3))i2=&?jJb@crfNvM~Z_d%QP3EXQm0RT%)u} zw=#6f*^|0W`|aeKGs)&iUcMn#%P0b1Mb+oTNuVoCW-0C`9x)*9(;8jnaUF&=|A;a7 z&)M)WF-fivNuQuAx<}3cVPTg1nfqp83g~rEBm(>mB}Nu~7Q$>NoDE+VLg^KXQc&dRZa1QIrOxj2d#|=RO+M%l zpJibvcO{b6s{n6;CeXGSGxaLGBb})aU7Xzd>OAZyF2NVVA^c|=LvZ(}Ru!w%uWarG zPl{V^*!3DZ3KrxmDoXK299*iv-~~3VB0N(f!0Fm0ca+Q#fB3pdSPEzUI$hw23<6Q) zJ0pssj42g!=wGEBq5tGxu9Dt)1&nH%io>=$DBd(l0tY<1PAU!}zB6FY!c4y|puOYz ze!f~xEessbRae<9&NaK{C)#j3yu)gI;kq5XMWJCmi@?bQ-R>o0JS{2+>L6?Wuhv)o z|M&sG4yAQEU2$=m)7hP^U(`;R+1Fho6|XsEjs@2-0CXV&8g`sv5Q?s`m#hL=E~;-1 zqsMn{TkkKztLcxS)ifs>^H!KM*Wq&EZld+foMA`&R&ZmHUi^>~ym(5fI`MlWG74Ud9@A6-lu1&8%aL{sS{2 z3=;-xrgZwpSblaQ`Sna^6%8zsVyzhyb7yiM=uu};_XS5d7A7-925Wf-kJ_SHXwMnR zpM&;AFTlk(Mpp7HljlscI+E3IK~`(*;N1!htAw*!=htRHy>+NX8z5!6Zd~b~=eGS- zfY*zRNa$aPZuPmR6XtcwEM8w>&Y^XQ@$IT^eP4ngmPH&F(w@OXhbrvV&A*I}4cLil zi8YTxm)Vry#I9Y7*ZU4?#?7eB$Ol(Ao`Q{FTr!HF?C@)@Md(-FakN~DFE&JHw{;0{dld7qv!p(2QENcPmLFNB@JcZJY(Zev0y0TLXz@E2>e2k zSh5(XmKD~yInSB#=!YUoYnJwnx*tWIH`bGy5DdJbAM05sfMl02!UM4dTzM<*1~6C= zS0H{OZcH~K?f>SLfo7=hgD~?8K@3=+xJ3vG-17=k+h#PU)?RYt+B9sZxAp+>D7I{n z`QSnTT2AXhhN^K#Q&bv6W$&cUujo9iC6RD-=D~&@@>GkWbLg*144zo122WY9&0>Co zpC!$&W{q)MuoHQ;YE5Hpv)hM;ZFqp=KDNKcJP%gK1E8F49*iB6U05=J=%c55X5a+g zbCK>#b)Gsb6d_%t@y<(sCgCm^seFZd<`2F6RJh}xj|@n`02Fq|u)S{SKC<@(*=}qK zh-NmPTVW1{#rQ2~+00!teRG{v|EpPUI2?#+PfY5S?=wO95plA`_ll-#B7-D?Gs(bd>Peqd zhrO;kFlzfwb2!{kgv9iY$HGHJuq|n{6%Blt-UwFlhgguh2V*FxY;Mnv4UNeg4qiJ-Gp4C!Yt z{|>~RG&ZdNf8+}09!$5~nJLDI2lcoYb$u~$s2dF78-OAecj%xAtv03z<}i-Rw+UFq zewl92;Po-Jab2?v-P6hDbG42<6VXSrlJ<#vOabEJ8+97r14+y5afcbIVirRM)GJIG zf~!(6`qq1a7v>Uc} z+0+g?5GK*!p=HlX*no`kOacDRb+iCn{XGgSh*|>wReX_=16jrREalE{WLjwd3yPK1aU;mYA_cG#5bJNP zcyqI572LJuO_TKzE<5kSS!9qUn;b3+ZUdY!7Jz2LIQQ!8W2dN8ith|$dZru%?U#SD z`rP)*sU7j&fgundaVJ`^2V6@LWPFX4 zdGY^b{J#kBbk8y7!DPJ@1;o5))}M+s5)Bmj*{rl_70%(lpCUJQ_UKnUb`j=ov+zGr ga}Y}VJ|z_xm$>jO<9@$i z&!)HF$L>Aw*nvHJ_S}2?_1w4j?AaIn`M{pLqThQ|cJ-w_x9!<;JolC2i!4_V!N+L+ zy7Rq>px?JIF)@X8bYC~{*xy(FJNKF3S^vnLPn`t+_}N?Ff1mixaXn`Ir}GCd|J-rp z-dA3E=Dx)1G4HwT@t+#^=dEA5K^@4EA@g_BJpW%K|JTs}KL11De+c{!f&U@!KLq}V z!2b~V9|HeF;C~4G{{?}+BTw#5S(6gOJ}Ik_wPUx|)PzG)A!oW@)i&MK)za96f~D&- zQhhqQI+vJRxV&7sEONCp(!*DAHA5iG=2?2#T%>I*;`Fq$QzPNv@>rMy0Qhuc6ApUl zMP4S~7zsa_fJzN{8tJ(O+RE|Wpa+l<*IpC}j+N9E>;}g=nSEk<_(ei;RWh$UdC{Iz zr_~(~Upad&67(?QG9tTWh2~gDw4uw5O=IxkYE3b`7IREfnVVZc^y#k9(T&l^35k-^ z5-Ek)uooYfW8LzM>Bb@)c7{nnZ+j;*r92IaO{-Zv8xy%Dn*nSSo=u8uOIx-BIXl+K zgyz=4`{{fzp$C_iyu_vd!A)vZskmGe1DtNspI`urHgxpyg2ICQu-9X9+Q!1E8v|og zx{1rT4kD@&v5Qu^ARNfa6cBo_HUQ&KC&?u zIq241Zfz0<&pz?jc&=?(Jr+rg+18C6t6bJZJMjpq(0{ffbx4qu*4v={=PaFjD`2fk zqlA3WBnPq}`T!dPx2AH(qZmhaj}`cIA2h01jZLh^^bGF+eLN?UTJ(Q#*+kmt&0b8w zq$H=|L*9Wwzwy(c(Haf~9F2|^aT`6dWM8G4=~LF8^0dq)dqZ)=`Q&U|WQSTjrPIu) zqQgN;EH5k#dm>+O=#h1M({|hy0NosEX<`1~Wp(~?!Lm@RWi_csw!wd=;4!OK~gQ zvz}~jz{^e#WC(17r?~6{G?sLosH}0yC{TwM$i&s-&cVHfwt6nZ1SWT0tHE%w&aC;DCL9XH1UizP;TSwai+ z$CtG8da8-=O0xp*DnrKvMQ!SZF~3I6C2+N68}&6tjYA7_&bO*5S4)vbmYJ6(sbFhI z5awnRnpD$M%Hn}>)phxWy2?7kXu6Mju3+kh*L^L(^kIaK3|+9G!Po{yjCxr!%wt7#64Zi}w<8nW4fO=?d~o*itjq@{Qq%z9@&2gL|k|z)|c)?a-jG!5$)3Yhf8NScfviDVr4}R+w9^aT^ob+sCw-P~g?l z0_@5N)#4Jis79Rq4Z%txd$m7@oCE~BRdQZ-22%o`%7B5!hP7=?ZDk=LX^P?jT4hjN z;Yxo%!X2(TRxH*QF^3y8Y@qv;U}$q3Msl=XBdxHQB`2&W=Ol7)V0>FrSDz%Q%JV}U z8QTKg!nAk3E7ly7fip0i*au~_niqwgAU{tnl9z6yh0mv3_#*~<7K*n#*9d^vDaTOi`c6UX%C*bFHyS;1MA zNs~yqZnc{TEw4%&n=8>3tC=}^MT*rF8wj$~Q;WyhHK z!=6Cyt_hcnv%?Uj*(jiY)U!=I4S+>cDLeTCEi5RQN1YI{+UoJy zbnW~+cU3_}u!il+44F~7xz3~_OO`8jfc816fl44++NQe$rPkr-H0|WC_Q3`hWwEjy z;{c~`G%k)5X$HUm425A})zhU)F9IaQT2<}@eP|r0u&XF?3$YiAwDvnrL$uo%n{*Mm z`*q?8C`b;j9+1IEBxII}WM3}U3}myF4ps&hxh!OMo|1aCtxRZY!iIV+14t>zJk!t3QFkV6lS%Qt;fnh?edY|6nUg)ttT3dFI6z$KRl0C5Egb)&qKO4%j~NXR5$&8|6|uV08wPTny) z)ggZyqc(V>w9LAInaL~X>ZM%viCBQtoom@~(%1yT zKzbYslIeDw=3)Grt}(Z}Z4O_#aLtZbVNQ$dDdmg)v%}{Dv}^!^RFZ3xAiRY?kbvQ5(KRzU{^&QPo@(9G>l zCkqz)r#nw7BnDW>6`xL}G!RQ`7ocN~I+{_A-3&y#0R>|xI6gKX4Z$wzls?kQL0EH`eAFPEVLDidUkh`>yknNFPxAh`-%YVDK>Z1bm-i>3Be!DE^DLJVdBO$<&hYf7*w$Wl@+8)atT^AT1B-jeARYQ@vM>T@|4NGp8K zS+-Zy6&-St23R*G3|fW_TZs05HLND=PMsZn;(|`d0FMpHxd^*5V-9O)F-R%)ZV%Xw zP7mY@nZzagaaMPGdd~K^(p=;fq}j`238Q8-uCTr$Z+>7HN%TMkSVLuYLJ7n+Ng301 z4`)V-S#CLQVDth-BjO9q?T{g$wyp0BmvsuNo571sMX_dFB2`bY`$ZWsN-T_c6&xT~ zSm{MLUt6|7?%-q(CuN3lRK8tfrcP-`aHW$AT}`xr#j2p>Eu$kFV-%xqYcQA<5YwCn zdpBu}soxe0gClY6k_K#OUuF=FJhr?w;K?ju>q+a<&Fp|V!>Q9k+sWKf&|(%aC8frY z(J_cH5-^E9mf@>O73D~^roTmjNsEj{2D%3~gU&ilfga7IXR~QJv!+;5VF9v^B=kDX zu%e`3dDlyBNP`+OdvH%^E1+GR}urdql*p;(QfOaQ5zrZEwEfs?=7_&2o zDVak-6oD65D9MmDco1Ls%Go$0jMo84xkA}C0A0^jaLxviY<{BxU$7+(U>_cEr6=IzT_EzdSN<7%K}E4-M9tO&o%x;B<*B_>w%Qt0KGUYj(@ z!^s##s4YOMd(Jw|0SEI7NAdX@G@H69w#33xX8Zf2Y%9$z4fh<~J!#m`PlpYB+FVI}y6LdSeggbTTh&qiV(JL$} z*$%SRz*0nxWXREwCc$BB^N61VMHAVn#3gpRp6uoQYzl={yCs9z!U7uv=y%YPdIH5^ zgx7fXMQ3hcoB~4HECK@C2g2ytBd8*v)nf)_4pCM{QPdhFHj9>`T_9U585?VBje3UE za)wDCN=f6ELBTkW))l0sfF0{yjiWTDw%IOTYxA&g;z?guSMcZ0HC__RiM<6hkAh~otsAeZ5m_QWe#<&@e87Kl|IMt3D5ILRBjmKdy z$jYtxR+i&*0C?mwOB-AZs3dNO&kp95Z}Q%2>k~VONrv5qF}=jf4;mqp1RgwGaCem7 zj9DZt-`y{S1o~T=#H^Mh!D4zfhvy8{PPWQp^?(y5pBC$G^I&T%XG7Tuf{LMJ_&Vnd zOeakiD3I8Ct{&};-14lTm?24uVYq?r-f^P>+u}1B7fRVQaQSAq_Ny2`tOwhhyd+Z8 zaNLb#ZFJ+{gXBg;e-^LCV4{cg>{v(@C8dXAZE5OkY39DocZ&3q>VY##IC5Sj(Ghl< zyiSSVG#3*ZR|y%W3}>;cJAHN*3NM_W?^Xi&Ib3YjX1-`OYlF4D?HlCd=S={ddk2|2 zv4F--RB~bo0!a>mG=kv7dfa3SEoEuhi}Y}abUW!f#o&$LddUTx+{nkX^~{{W ztumZJ#AMkqQ&H-}^+Fk!BQR?$%5#d;Yv95z zRF!jMW7tJSIWdd6CQ`m{e8CDa_H*<^?73rzyyfB$IdRh!AEz{IX;j*dxCAVMJ?z$) ztm;!+AqFK^F&Y~SQKVFh zEn_&lMsYO>=x%1pH&|V(34kJip3LcXRtlv(qjD#0hZpHPaE0YbLVJe#?;~o(tX4=FHgsLivK1BXQ#Qm&p6B%&KXlG3L zQcH6t*WGyrYp>1sw=n=t!!iZSZ4QUBL%?jMJ2RaeoqPixJ6sqhL*e=299?TYE-y!l z!EoZ8oMB4>X{8K+Vk^^lK_ho0DE1keM+bE?H^(e?1boFc#~qdUTNR0T;gKQ zsE8oS7=u(Y-yKU#F*LdCe-;*-{B zp_OR^8oE(2Xfkrau>h>8GI!b$BB*RrBO9aV2r4$~N(y$R5#z9RTCtcDETfuN8B|Uu zK3h^X6-23;Sf*eS1H#DfH<*XJu6I z-#7tH0RflT42B@nO~sZ@Sco1~1DuYlw%-V;T7noeK8rMQBbI~+XM$`6oVL?2vWK;X z)$%aw5NtATy={h;q$6)G_!#g>ZJCvbwXt(fz_8~ExzmmA;qr6inll;c+wgEky4PQO z%-CwUgi_IWY+Jsf4%~G+GAL zjf=a7GAMVAE0%)a^E>1UNxLh<-0+J73q0&4jLFx-GC%3c#C9vxVbR{=WFtcrgb;KG|FNvd<<8eU&sk)2ii_~ zX5_q5ScY?aIk(nK8slM7N?nrCSSEM_<0e?-P8Q3J5DQxxInxAtu7xLe_IrYarxD_7 z_FPvPO-sGXSyk=Qv25X~gvdtDA8Jq0g$CkOf8QvA#^vry_AFOo#zi5lm7`-1kEKiO zv`hz|$7!dafV_816| zj(ixS41mGT-5o7}xM?G}2_WhwaJbnFOPASdUE$*oQI()NP9GzN3Gr)G(&!|I$vm!% zj1opP(IM0*yIny^8 zCrS|60Eh=oom%Tmr_0lNdpzN_2TLa%vBBIBUtOk`F6Eo2xu%q>DID{#K`L2_XF`%| zj9L|NN#x)fDdu5<0iA<>PY3{EDIzsE)@zcL8?aUGY-ecl4AriM4iCw%0>dm5HxoYv zv12WroqZ@WkR#N`(lgx+6 zQIy_%5ImwJAa>7P@RH1@LSh&!)gCwl;L2bzU~-+InTo5z7_bAbKA{wd7xl@57)+q` zmakH55*RdYrGTnw)k++-tc(m``A+zurc}LmUA2H%9zR7un6HyTJYJZ8K#&Nn6UPTM z5}t>xw+2~6#5})CT!zM$=-4wP1H0ZiN8)gXIn7jPwyzR1gv%~E5wcixy707YE{G`T$iV7mvs}eS*I>2Sq@!} zOgK>VhCr?|S|)4&&Tlaw5?+JN0<_q5UDxsETY4+jzpifN=+i?P0A_nCAOjFA3vM}@ z5L3doCKJ7ZIM6MC@_;ycfF=jlDdUPl3obDqFZRqStC5q->QlgIK!j1^3;&Yh7OcnTiL+v*j(kp( z4YrYnCK@QiCEI_y-3e%z)7IS9ZwesuiJS#Oio|AS1jNnq7_i_f2|jKnnQ44N z1SNrLXPAwvrzKW>5t|`dG;EE=nli|B*c_~trBXAYWA$q5L!#RQ~3!AB7d>_+Y{nS87vo?4z(j?}rNiO^uw)l`7tmINo9 zCc5(03q(c(x zYF#phvmQPQjoMu?gp}ZRcFx_uMW<38G0)!Boe?AD6Y5wqWNwpP)5&t7(4nFRu{ayF zIglDDKrHY6bcLA!8U;BlXkLw*z^80t&m9E2y_3Q--lAl4x`~?X86D*1SX<;lL(nSrJyRI_V({4FREvfvC9?o}YG#r%9T&woajfvXc?V zw6A&W%WdSGKtrY<+TPA2knH1`CQxvou#c%kSkyBvAs|pnVqt5lQmDySgWw`;8nU3$ z?C1`frS!3kbn}?iH<#`kfa*=NzOZBUOu$IZj%g>EReA=BIAjbKlvWPJ6Zu>SSBgRE zMmNF^07_|hTd-!e_B07&-fe2*5^z0~5;|$6smlbxDASpP7WUj=j}tjnT#z5y&K(Jc zh>8Awm5X$&ZgM%BqzPxRSY7&z3*0p=O4khorNIkIN=wENx?GEuRGn9vZ{c%B+MPhz z7y8*jPccU*X+WN@!El?JLqO1GR#awo(y(SChRdBj!UPDgI`XJ&F&hHiarZI%ni~6E ztIdiXSukq2mH7yTc-weCp9@dol1U3@3A`@!%$H!Nn4{Jg6+x1B=C<99oqe)cHZcvy zZX!zLxMY5X`J^7KOpL~x6^3;h)-H=~by8$)v3flB+_6dvDo3SjVv{xp?mWDcpEpqC zWreaC8+9x4b}0u!pd1@7Am$z;=1-17Qf$a`3ym~;4t8@yk3JD+5+o;v>#t(#ipK;J zo>41yTukXn)FSRJrrBF>4i9m(P1>wf8d9fjQ-^TV%Z!z}P6ptaMVIuer0uBEYiy1Cv`bh`dQ|3)2@? z+wkM+-S_?oO;+lmQcy|&tn2FJ^Mm7T6~9HMz-ICYg#pK~w_i9?#6`z>{R+9|^q_8F z4$TGt;siXtp*hRkJ~t@gs%R2JL4%x9eQcak$_ldUYY^l7-Fhz}u&%h8of_N0OX4be zfE6}GBx}ut_X}kL9H&X=5wexNT3ydcVPLtrh2Pl^jbXIIyaJD!F9x7DP)wX9M3ZcC z)_iR49n`4LqVgfvyFCR{K(Il_d!k{zlQn_{E&-d=;Ekqqpk-$+0lVVZ#(|yd+67O@ zw%`t1+L+UbJkRMsorP&R(LT&CMTB0sYU`#BS*PPAmZ+N^hA&*imAaJGM(ye?%dLi> zIulJx@@@+owQD}f8nK$_Mv*L%&HhFMxhXo#pHlJBebe;ZVnc%Sa?g^?x_*{2t>IkVglH}% zgT&32NGfQYQ9FxZobx*C@+&71FMr)mSY&u;IHL91u$8|W3pJAXOAVCL20kC5tK~GW z^)H+N6DrP~+jX4k2$r%}`KvAICp-h&L_ys|fB%A+spK$6$R1H9ISDT$fE2mOH0alqO8b&M9aGu6t`&@|t+?$-3RB>(@3P`7t#&g%NdqVlf zo3jKy&q`EeCNHwKpZ}uT&le(!*AZ}5Q6PIGSHudcz2lMh^TJ<0Dodmgb21WW&{9rE z&Oexig&+V-8aSyO%ODTgEke#t(-cB;1`uPX!QsbNgtY8gSq)K03~;V9wp+z)5Vc0m z3oj^#z}2-2M7)pFmDM5G$*=cwJ5?i4(n1%B9mok!DRkbDWlil8H(+|@j!JlL-F(m~ znz;s!lUQ#ptaf$IsEUH{Y`nkK9KE4DswQaQlqCfiy5w@n(V+!-`tW9*r&BD0YDc$s zw;PPDuGJ1__&rZ&*cU3mjz)YDc)oAMiCzC!J32c!8YL3*u&vUD>cDHGY}Kfi@OJXW zo!i^?U{E7gmql%_GZ(0ZF3z9{eXCvEwPf%LV3iAh)SaHxL`=({`De;OO+phSbOj zc61ClA*g^#8rpDqx!Nr5S?jJ3s|gYD?lww=6~+J)qs33=h0XqJ5`;Mx%$!R{msn{z z*gR``f*ui>KLo@JJGtBCKA}ujT_7THVY7^&D7e;dq4%VCDqRo>$=*QHZFP0V!k9Tv z-&!^~6$H5emj+ZAb-S~v!*IBzr8x^>MANY4Q}Wb6)C8;;3##2M{qZOUp|ljiW}`O@ zU`V1dzUYKh?g(wki4+ls)zaPJX)8+&lCnbc+!no^AaIVi1)3C#S+pFxY&K)NK8Z?% z8%+3fS_0lhnh<4n&|k!9=dcqUz z_s<0_-jES2qBSU*&Y^N$)3KyYaib4o?kAFT zvKpHQ6GNm3~y7b#3p576!r4s||GpdWE@nAm%xHRxQn3EmH#p_X#?#QHTTDrLF zX7;VF=|r8)5^EzZ(4=nki86iRuI9EQ0uY9sKDUvPj!ULEkdw9Wyy$w5Ar&e+;%0F+ zTN`OY9!K^HXSl7!yz&|gJ7^d`d#9CZ=nry3wRL%Z*I4s9R@6eX`)5weyzb5!^%13C zVo-WCO%xCN2%cCsUywV{r7j_Q7BXnu>-dpD0lpGD-pm*d7DVl+46e{yi?O50i8%9F z0T4TNO5kn-!<>H6DA}KNfqP3oSn6|k&gq8=Qr`GAiU81sapWvo2N`v@)l-ErC?(o<1^{KuAoM zxT$G=(=7LlA``ge?_zZ*(;P*q5N&pJ?Nxd_>!gM$6wh6YTwyAD$~9{;J9fRUmq$QO z2v_wCsGA1n9wAtxtPz6-Rj(`XYAn{H#+t9;rx+l_LMId!V>ZVr%0(=WtJAy2y-i<2 zr=(EiLtu-WDj{&&)Ll}^5WUpUG(wgrO2yjKQk;8B1%=US&S}OqYFVhxP>EoTq$>s0 zq9ZsB*MTBo^j(bzZi|`hDGqaFOoU+>9MA}LXJiO1&)5{6qt>ic`ju#>f2+Qu|v`CANSIosuB7}rr?Hdh>_fEXpBa*77xcEoHWIoAOwi9Z7`N(eNy zG>%};vH6Su{VUuK#}QgT0s}ak>Uma`xCDSlE;LILlZS$wPLvG8sjqhP#ql1Qi6Ki9 zln3UcUa6|3nV06KZBA-c)2_x*7@}Z=jh)JrRl}f>aVo}q(rL?{1%$Y|?b^JFo50>B zS!_r=R{|bO!PpF-*RI9w; zQJWR=nksjzx|@~)3?i1fE*R9G%ObjUQ#T|v+EXz=ZR@aZrvJM2Bfs5K`y5e)Ln=&5?S5@=~`IU=z-GQe}r5CiMeu4YKBB$$Fruqzg^ z!B;q^5i1VBiLt8!%rQd`^MVt`a7#~8c3WNlfTj8Elu+UtCfo+?X6DwyYw7JI|8C zg%S;?ctg5fH>;DFSW_rQ>6*94XHLUpM!SgWv~{_dXs#G9bYts$9IBDu?b8Me+)a#B zCYHnDaXY9OJ=qB-SuJ{UDdL#OwGCnk9Wza8Ibe>EX#mpmj)`Ero%LI}Vrj2Dyv!?i zvpLtOoKq-CQbme=y#w9R!9$LpA_cliOkAS30jU+=A(9C45eR3d5KJ#AqxvhcSx)oMQ&L0)d zd50$EAr4r8;F>gqP-@swDt5fVfKp~{L1BqPs352U$3>I8@@!yc+TymDX(3{! zL@bqTEK{D?yO?bcmB11@5@5Q5(kb2&#z?Un*~F`h#rfl8Hz(>2r(i>tgcI!Ga%hAK zcS9wZF>*G&!Q9dv#6C~#?IGx8F~o{IjI~RvtMzEba%WH-D&XpaaQ}>_$Wu@0EvFzj zLYKPUzWw~&Nr|HfH37&An9YNmt*5pPG`NvqEE6mly3Wi6{h1jcyIEiyiNUd3v@JOZ zOM)^pF*%@FJ&oOTM^9ORhbipLAd+*k9gDE=A%2Bl+Hh*>rU}<$)HH|JWLj@f7a+JD zd?_Zy8-+|nL~6I}-3(c#JhOC)70vsU;o>?Wk=L^nz8oG1VrmO8{$h*C>)l>zJ9A-a z4W)!fPvYoCaf`NMPnQCriH<}*vAA%GQcBF7ABn|P`w10@TBQQ33mDC2GTqE^Zrl~sb)G(jJ*(ZS4*+rm?r61FbRm%IX}fiP0=u&oKb9F;qQ^mi4)5f= zi(A+nb-c`6_ccTTwxBk5J$Tk&h>1%@1;h%9egQRiV%FpvK@rjdX*ypKrY{Nhs$&g{&Gr{nlFSZyo!utWDp$zUwH3~ zSGE9{J>HF`^ z9|Y|(XO@rpm+E@{ys#xy@8<6LN@{%T{SK+V0%BJ<_zmC~Op8iv= zq>Wi?s$T0VsnFj2RLht994oILlJ+cjdV_H{-$&W}H3;Nw`0oA{*}{Dv?w>Q*gzi`F z>b#rzC2Vc&y;Jm(+a0hk5XlEVDNH5qFMrZo_n*gVL%;6}S;JFZABQySg&b$n)GZ@j z{Nj09-78hk4{UWm_7(brzZ!@abK*bA{@~lEOB-r7M($;-9gNM%X?&ybw`o_m?US8> zNTBV7(ucq5w_bX+;r?GcnmC8V#e07FEssx^r`gVq--fSlL#^G0f(cOEa?Dqqmzh!><{N(xxLebbqZ9fI6fB&rd?!XiK zGmh_>*uP#Y{BcaF?0^ZLVb2Hf5Pq=!$tx1r4+Zg!uYU0L*Pms8=GLAaKlk^UFJ3+l z09D@^|INsymQ%_*KZ?5~mLL7=(A$Qr(q|ri=H=`+x#sXW%>hYkhmQLvd~A6Bz=yq` zs7&`fy#HhGdpi&FU?sl1f1Uv39x1rJ_R+Y67do3SxZaZ5?XnM^y{lp^d#54mvh;z~ znM63n%w?jdDSo zvGos$mR|==^bI|_tTwGI=?q%^^S5()F z-+86*;EV8k^Y*>6j!Kljids6{_}f=S$78xJUl;xO!j%csUsK=fKx@IL?!MG4=9IoQ zY3360$fm{;c=LVBfBpORsh_Mpv*Uj?J~X*~k@{IbI4V24kvi;siME@ zGG5pJi>%BSY^@qwOMd?R1mv{Mi2^U2C;1D$|Nh67XHoGlf%dfwc{>>+bl6*sw}cP;HrLwu#VHN8k9+&8 zgWU$h%m4K!m(4zY<-2E@Yeyb<^h5*V+x~^IH)lY5zCZKboyC`O)!FYf)TXtpf3$EC z^#x;ppt4^&-3WO2_CIpIx?A}7tGC^LuXgCnZJ{S4scBVQ+@~-7&cE3lz5)DfNVLD< z0r)q^Un+g>_fMe9I_X~n*PaBudB=fAhGq_3d+4oTBzV?b_u`P9r}10umu>((6+C9= zr@rsrY5(H=8c=WlJx@=~G=KJW;PZiWHc@s6`^Wg0#FI8G9Bp{(uetP&PkLVXsPm1A z;GVqekvC5Vj}f1w8W5Xg67H+qlZ%UqcZ?rCUGi{z`VXMfsr&gC7OsB{+i#TQInC#2 zIfmhi(8qtbZAjeTeoXlOtyIdN`}V;eeB)jJmjLPTPfs|WL4Hso{HXBGnhbCsuju&+f9c=KX_u?1RbU8J|zRKA2o{>5cp17T)Jq-u2HN z^_QUum%zeSZ!gz=wVG z_vGxOzPiLuuHAXzp{`FsrLBKF|KdA$y^bFk&Q+{GSvLDI?VE3y&qR2UsR71Y_czSn zdDH#R!-J2l7>~UdlO0d0{s;T>)Ary0P+!P>wE7eA$p?Pi^UI#vr|%0sQTZ}A7E)c+ z{;J{P;R5p$=O22~)2{KJ(}dz@R&xa*M3_4N);s_Fw*ULZ`ntKt?kfH7Q<*vM96mSC zVoU6I%C4Nue$xY{>Cb%C{qn-$Z%Z!V!{Yv8^liO_4?o@e)E>vw9j|_J7|Z(;lK1gH z_g{W5^zzYP(#TIH|FKs}m-Hp=Rl}AG%dr1_s_zOdXXC}9cz~j%>i)Z-U;Sq;xh8*G`=S-2gxEvjnU024utk`S8n4)!sdc&+NF{7f%OKS+tNcCs7U4kA-To$-u&ndp8sa6g%_uzg!y(oZ{lCv&!}5k7@V|GU ztmyiIegd4Hq<<@}#Kh&)K9M`O`^1$$?sE1H*RGW)mHqF3 z`S@oay%pSa>ByhN?vs3@c<`g;M^9a9vN5eSeP>8y=&w2P{>2*x>0n>S{)zOJ(bj3( z&&bnHJ^gP0_#uv^@9BJx@ONFr^7ypt&PT^fN-GYy+E;(a}Mr*=d-sjS?|Wi=}wYz0qgZosq zlPzC?b!ScY-l4v{iD*Ff-;O&^uf7jZb@W1!IR2}rj;ze|RYQAjpBH6aQ8!dWYyKtt zxcgTnf*D+pb3;FTmUVXam)(tOZR6CNk)NMjAe1zHgcY@>tls~tt+sk<_QEbV1Q`~` z^4?-`4^>t@bQ|G6yOXfn)Zm);KacU|&?)2F1kw!+Gq*(`7gaBQb^D0?P2;0W)XSe;>bUzE26$ig{a0Ur{QSyU^MSShYF|%P zPH)=&efFVqg42&TKl)X}s~jQ8VUb)1N-Hi8-M)z z%+n9QoppcdeP11kWpnpF^wLAh6Kl`<&fomLRnRkx_dM6T|Dzw#AE#Ve3p~fU@LBW~ zT0y5k3J{)sZTeB_hwmmlXS)3$>*u0{vx+Bjv8`pl|9OP@kU#F@!f%`0zm@&xnWT6Z zCI(y9Q~A;Bk&l1;_No3(=*{Q3h(k}EI`wbz=hDX=d+oB1pC*Snm$6@gTn0ievXGpy(&vOJ8XZbymc%kGFi^BPYH!apg0muP5cW_2a9@Vic_D`$ti?Cx5pO zKC@0g_UzZUZteCov)7*3c&v1^WAV~^+M$0+zXc4vT2?MC{#Ki3yPxf?WA|Kq6*#;g z{f$61_GKQX9Lh|Y(bzBic zt0?7;ZMD;;*y*A4kG2%7nLW>c;J>qCTP!yN(&x>D^E~Flo zw&xTWzFEJhyz^Ot?ysgFdx|4F_kVV1mIM3cTvZ-;k(R8(4FQhcS7ZTy{o0VY;q2jD z#}jIVy7|F*IX-ot+G@=n&AI;dwDQEOFMhjnaQhMV8#lyK^CQg*bKkxeIx3vzzLTiQpM#?B!R()2tUk*&Y5hHaUHMPtZ~cV`_PsCPT3hWEjPCz!{@WL>NseCT zmPuvozt_JH;_okQxx{^A?ESqjfsn!Gxto2E;k<=33Y9JSX~AcBIDZeYs!H@7%VC|^ zd;QYQ?qu-0zodP#CS9T6_CL)@x}$%sX^t@q(|5;2s?BkuO&`+>f zNaXxKN^~g_`o@Q4f4?z?>a^c;;^SA9%J6rD4|`Xh{KIMmJUv?Ys{M}}n||8E1vko0 zUrH{#V$yKbIBQ7xyQ_i*3o`269y(^q~EwX}3SH`8D!JNo|sIY7q0(HM~ip)pS_j1>b- z!Jmn_Rs231m4i9)@`-Z>eG6K*H3Ifb)wj;yV^eQ)Az&oM3Q}W=l|?$iJxU7ZG;Yon zPNpDlW06{va~42o2>yrGecm=AI?opjmY-b^Y2fE3+2<0%^uOs^=wYMx^;)fw!0cmk ztEC`hDC2_n#QdFU7Pr=5IoXRXETXVv={_kYE66Bjw+e{3Tqc1^!#+ih#i`%`sB7eJ zXiB{uzg`eG9#)99H}B+SVfX5;uOqWR$$M0C4pqaVYvE8pgLSn@n86bgI)1$V)lUc~ z3hscpFEbsmx5vC(^9J(8AnH=_%de_%E0VLZ6=$4_?-fFn3yi=15w=}9%e$gca%F&j zJjj&F-ods~2e&s0tl-zDWR~D8mf4#e;9m2SK1tOvX0I_^FZs~yY+@U0?htM13Y!+PKG#Pjd9xBDYVhs&B9f1utxayogp!J`L%6>-z%!tnU&vIQjsP;DJM|e zadcNWyjx6YPn7$y-rC))g#tD&4f8WBu=M;YB8Ox=+fYP_W!+@`^gDJl{s&=zxwPt% zN4^+()|!Ys{6H3NO9IRD;8_3qFRe^-S=d^*P@^BK3qrT1AuuCFFd*y48#{uP{#I%k zgB(_RNIK=(obTCle|-sMf)40nT+=>@p_H9k=2>t5LbHbeo(pn?B>-9`MB0^TD@Nbe z_b~wEK1hGh6o=)XXeRnHt=q#YV^`8LH3f2dI$GwQXi`~41cbMwL5{JCsO!(`-m(}v zeNPfYF3PoY9Ka21Cq!+%y=dR+ER8a1kb=le#n1|N^%St@Eel(S5)%Q9XU1yd@J>5YRaW>I&&BSeLcrz5{X`h8U|uiHKd|RDC;43YSNhcuycWXQEzjYesK~HL` z+RbM4RP>JtIQgk!d*3M5VwDBKpgvj=b< zELw7N?|$k@XcoTfDq*Q?1H0LY2k==y9e$oBoTP$R)ys6zv#dQ#q4nYywuZXU)L{1* zba$*n82;iCr@Rl>`Z@fT^#QYUwFS}U=`s=5w~#ext*iMbSB&)Tm~yL@21v;U(nLz$Uu)5@9oYb{O5I+PsRM?vP>@noc>dHe^pO;wmPW#6) zv7IJ!Coo&#z#y>ZXP>0#`=W%tb6>1LIb2rVgwRUgNB|eo5~C|rahy%q8iIqHL*FO6 zD4MsvtnE`E`DVZAOz*?}n)+pRjx=X}V?IP-234rC-JDays~0{ZBu{|*t4nb*&W0or zerLfhaEvIS6L7{Vs6^&W!doE$|mP_;FaHfB~!tMQdrGr}(1=mHYA5_;8V;S82zrXu*itllMoxfxixON#tBWHT^vT zT<>qPjc;R9=~rM~pG4RWMkqLynqm9LT+j^m)Zb_A_OEl_fbfr++ny+zUb6R+!ISWJ z?U*~HzE9lu%-PK)`O|eTLM1E+|cH zZEB7KP1^<7?9~$ICBUsD|J(m*@|y&0XKFrs9EYe|*|R58Am1OZO!|9x)l&`yo2)TO zD7nv87;W!o^F1cc1x;>`G^lJDf;Jt`p-HR0&uwC)K^pVKHFHMKSEwYG7#duGz$(8$ z-b(FxRTr|z)Re=ZHfLlZTxmf$sQ}^%t7NVV*k=-oIgfpaQ~{rq&g5Pe!hlFjRgb$>)}96EO_78DyUf ziBufaLf|%xSO_>xR}wL@n2Tf7w;(Ix3S2s$yy%Sx>@(4(%vD+aIf=Fw#80$p-e(~r z2oN%X+P9*m|GXhwn!+*!83c7z%lsPFh=y4yk5X@gd*YhZG=6s&u^d25rjsAi3phXx zN&LFHW$iOuftLbhSIqhReO6Npm@WNor~`fmRffzKwcF%Wu*6xoR^IBj?w=6?)8W5C z#w#qpj?+uS4x72+$8MX3iS!Sso?fv3#FtPP?f6%`Dqxv{Wqjv4RU}Ma7977m&({mM zyvn*4GQ1K`Kb+#1;69ocFmU8eurQnh-k~27l*TnU})_tT*H+ zZYVk=)8I1At>pB7JDs1___`W@s38HCi_uG`nB3mw;R1h(!QmhYS*204X_|VA3XSNl zJ0@NxGBhae*+{=}qKt_(2lO;*e8npj!?iEZ5L=E>dUo#2c>5ASqjq<<;kyZGbazL+@#+~J-vv{{Q3XoZ)>}X z_LUaIgO{G}o7o4)t~a~U>^TWiBQCN?Gld2*X)SOM_4zCJH*RJfws({If@Q{4X%tx0 zJKz%C#p8pl3QNQ>6%Y*%AtU|{E_t_+rV{h7iVae#aKDYvHRt07L>w$5zrW!|bej!a zKY`9Un}pUzyH3&EaXiLLL{GwWvJQg*X&;lCDOCAj_mBZ*g?yLzD49H=Gyf=@aCT8j z*K*=O`ROR;T)KXj1K2p(qfPCD47dYvDf)m5yjTLO<|0HJ2mbJQ?Ib7Ke{W1Rfc=Jb z_jyskj}8Vd^HoSwZXdV8d1bl41|kT=MLH}Mj4Fg#oD?4<7$Jq_s)>C&J-`(Z>yrHP zuRIqCHKGeYY&nBG@I`39mg+L5^59XT`y0#`hAM^Rf;x14uG=`Ye%xVRue>#molMct zr7k*>p)!a%#)$*GKrUIU@aV51n`!q`V?w9PWaHt9lbF)rYmRR>98+G8#Uc%$3JT!) zpowT9q6wZq`!Gd>vb+`KnlCH>@Gcdai%!MOi8B*isiJ{q(kuXLMWh(v_V9AOvPX)- zmk;Wv5Us_b*V}hIEBh)~29^DfL~jA3@h~s;^kP>_Y@yTmA9h_d3=l`Cg9U3_v!78N za?x?r1LqGR&7@TiabGSgrQU%4ZE-s5_t9ZoqAt-ZJvd8BFEmLU1eIo$SDmuhxL}a` zc8Tz&sdezG{{F$sP==$D`g*?)YIUR13aoVAUJAclIiqguL&GDP!`#x|Yez*{3+lm` z!$T<3m*7*7%Rj|hA@mS|R<@ZX^TLIzO_4?2|L4bx3ENRk1W z-&$q-K*LV7*;~6^9C+YB_&PvjyI${92ai}G0|KSXPSIvz>G!FN=tPPN^*xIb&hXx6e&F6ccF%h_h>J<3`1BV=Gz4>VWV`PPIWeLDmo`5r zGJww;bo8F!I>UIma6Bm&jd|I)e8Syy1XfsB1a1PkwUq{x$zeb~JZME4>j=%XPS4RpHes zoWlESW3dZ71$ulrf>q6r56)IBtOUTd`x`{x>OH7UOP`{7#BVvQB%WQe3IR^tAO-A- z?WFKt*H%pL7COT@h5pPA*}Py+^JUa4njqRlCj(AUB21pe*=0lxqtqJGHVak5atRNR z-m0A5 zM;k9_I_{sS@(Mu>ZHF3oLfn1tHW+p%fGLzc=TlI{{0nG zu4(NS=4T``(Qi9_k6!-bzqlHfvBh6Mwm`z9W6}t6!rSS$wut3|6>$_`S;ThCA<3kI z5&W#hz*ed*doA~X0wTjFz^exXmN7zTkomRAT;tr|kT`4ggGfZ4Kc#@TYtDqyYNP?{ zeqi{=cSnm}ANcnW&w!(6uAa3kqPcrS(*W9$iMP3JQ}2h4`{LYZWh7*+(f^nK@bZ{T z1uiTKLy&M^2E}b)f)=i}q5x+QSK>pA!fem8dscc6EEq^_{mKylI?FT96}DVc*6%X3 z_p@Yvv_S>fuGxKrZ})<8T>ERO(Fx+u#$>N+{~-*>W!;lZ3ET6VO*eJPzH}`QuX~Qm ziL^a&|3b~~`c~TRgy9}r@EIn7e@J~EO%z$tyNsD=Dqr;N1SOQe=qb{4@| zKmbB8Q&}>222fJopn}2*wS~M*lla$v@l)_!fEiMQlf^b}!=3DN3sV5x%Y_6O&834{ zo=*eF@j@e{w(oiC$31g7MManU*o0i+w3iBFg>6}HfsCbK^CMsymPK6(^j2+XA=L0( z-3GvET|u6rVuJOGan-H2S>MM9$y>4D#+W6OvB>$APJ{^dSc{LH`15^M<~?B$61+cW zEfHFrbKB5Ddlv9>38Meb`^g^vPQ#Y0w<_EUda=@?FGMbAS1rhCTE4QLi?p23*p7=v zaWw2{I5ZR z&y|TF@>v*t(YN@!O^p2#>A|rJ45vbD@jbt3FkOHhklitEU>y!b2 zOqANKirt&eRzPkggg0LgLMYM|5RFn<_55=(3Hp4`l=Jt}VC9t`!v!MW$8FL{Iyj?M z_E}D6{Ab>rgy-}3t>g^(?|8c?a;+p>an~vMfQJr-gOgy3*j8*)-1r0pzZE%^K-Kl5 zUvlj5XAh|4-S^;(1D%7oUb4apPxNjq!MWkxhP3^mx-VN?BaqDufX_NYCEy$AsHq5OxFJW&Z zA$THiSe?Z0Jwv;7CP`GF!8+vunIVkO{J3bfkTL8Dd)m!QKk6cf87)HUbFtB2Hf@fF zSPG&^A|xE<=3khPO#Yg=r5y)idyHE1Qj+1sgsi5O;r^O z*m~O^HDZgbGAM=rIk+oqhG8JJ(f|m zjR0E%+=!nHO;evW+Qcsq_hVq~ZvcrAD`YQN!^1<$47p^}vYf-|R&-_zi z`GZ%CF}J7(MPpd+!_r`aSYY8RgkDYzNqy6s9Pq$^-1~2d+C9gQlja2H0#%^V7a3>=W#hH`UeAQ+g!wd+1V@J%sc&- zCWbuY&+6Qz_#hV|5pP~{OWa=#9s@e3Pe{bVFTnxGH*2w`1}tSc^+5wp&*?VRG?wkS za0EA5^e6#kTOaQ=A7@`|PHCgBxyTntL0~00S4K;J@odlq3v_@u_5Seza;k`wu#4?h z+Hn8Wo%U9^2=J`a$d;#^<==AJCqzQwFTfP%&^=$JD*>M#eOUd$!)~tZJup|;}_d;0S*PM6rVwv)_@>XZbG`jWziKo)k z-E7S#Fh3XNFtHL;EdM0c*dFwKfn~(eaNz-Eu0EB0Ftt~gV7yU`Vw}MqQ{X&AM5MA+ z32b-t4SsB=wp3J@EFJ)?IH3ewJUR@nV&UuNOo*^qwS+cy7Q(=PPuBRAH%E0Ts_pzc zKCD=7PksuK5~=KIG(30?tW>JvENagSnd5opgKFuRI?%ghdjF0Hz+3pAzn2=Y*BA98>)0NuB6J`EVXD(9(TQnsI3cCa0-Hyl`!MO za-1vzOtfF&lq_^S^e;PN3$C0+IwGpV1=j~EJQP`~r4|lh2DtjUU+Dk}ox+|;oOWme zybqtljV+dmiyLA9tBNY$ilYqQo9EaTc&cHXlhnK^`Spf{sV}IP!yDS3{hBD=+N!wz zKrN80%PJkRIDf{4fl37DaEw*&;bIdS{d?9-rP+~wf%nuU2BH!8(DC1NO%C~63s6dY z_-@%@S!+rz;zpA`pHjTxgUKovCn{Y@Kp}OTf1x}>tc{335dYI&?DxU1tdELaH-(ni z>PVf)9~`uZ9pyhNE%tM=U29{EeqFo0UNHWE+JDg9@n)Uk(~VE~+Jvux*<_urQHj~p$kdLC)7 z0HcOFTjlnU?Aw2@Xvy4D=;`jw8w79z7C(jSMTf2pugSWptk}n`>q$0caEoArBRmc= zZ$&L3o6!V>hp>`>Y3ndJCUJGkm$~nzZnKgwQ8x4e5WLMoq_2QrNvhBW@w9ROKNZyr zcvW+)DGfvqST3l2G;z^c+tqWl7_ z7Pc)WDK-cCN?aa7vblP5%^)8DKSJmbh=3G^U1Pq)N_*Govwb@Wmp{D)H>=p~)9>62 zi-6H*g(Wwd&4;C7`xJ&MBk{!nYTd8=rm>f+M-OgShCc#%r`uWcJNspW^Qrwpe`jyg z+1bxRK%x(VNtB(+xcJ(8JhcA5#!j2V6+gSwawq#*u(tmG^s45YQHvknsHL~c z1V(Z(&j0TJNsCw7zEpm6H(Y_<2PALrc6~pWq`^SgUsw)%YKlQ>+5|VyRM^8GZ`re= z9J={s#Y)FvO={b2V?VE?m9FzQrZAmp)IEl5$h9b2kt3^aLSH;!xM;#tVR{;T(?=$6gXvhiyzE5s5KRAx( zB0b2ql2rc|>YRYEwhfx8t!$(xaV9OXlh|)oB3v}rlMB!$Dv=H!BE2!GiLQ7;x*osc^ zr@8`@=X%N~N|sdtz09`yQ0z~hgDx7>b@}`^fEx>F;Z+F2uM=fvi#D+K=ivcS7oA&_ zc^&VXXqc-m^(qbgP2}{k<#38z%YN%s06LF;5@+FcY) z*3^@~V~!}oGt7G5{E;{Xx#Ew#gc3-$XHAfS=&OIM0tv1l6t;{8tE#cwmn?!hQB?E{ zA9S*u0z$$t3sX6t^QyG&E`^)BWRj474oy8T0|(~E!FnRB#q10wsNM09 zEwCx)e^6L$l~qU`juyYlBU4zjyEKjy8lzq$=N_{CjGE zobES;K;pBLjurs?(7~U+>V!{)!LIxAZ%GvgXY*jlK{uu#eprkp^rSQQ(^+GLWFx9-L?oEU{xKz3XASPWuSeCRx6 zaFL60DPx(bfpc1XNWf?R2*b;sbDq7b_@AFg+9yClB{B_BV&P(NL88(;`!U6lot2BQ zCw*uFPznW5ai=!IRQohQGN8n%K1S`mTZts*N6H7-Zwm(lMSClze_3sjh?Sz(&1z?fA z-a`J~?17PnQ`mw3svM#jdI1E*jcmp{<7@K1Z%jE`Y2SF6xbm}6R}HevWMUeK6+z&B zUnr$VJ6zLXCUQd;r)t*Y4Qp$)zj?52##vki17jiarvZ4)1(9EqGz{dh%30s7T4Ugh ze=e6^1Z3I&oz6b?e3Uoo0#&4$Z-ld37HYCmr43HjJgNdX7cQ!vjd&SQu(3?bf;{VH zY4l0S#KSL;l&CfgRRC6c&a1+1U7^Tm0<<*e7sbb07#duvVCX>r@IP9;;9;0HjE2k$ zOy+mxD>-SS@%Jo|MI1r`$2B80q#9EYhD_0dkuOA*q69x)Fp=;lX8~0`Qu+rDhT4CV zxAsW$LW%DjfV2(v{kR(*!0E#+O)fSY0A9r%-UFui*d(HiOL>msU>2Pvu=U0hY?$yF z0lf_4YuWnIA9Rl26&ewh-NGHL;Eg%=ZV}vP2lv!gCXdk0HF#6rBZ%p`o|}g*oX+X zP)V+a#T4=fPzm?0_6X(VYT}nhyzaMM_T)kZmy0w^HV;w&**V;Ep`ioW<*MwZ*B+z5 z=RDX2mS9kQek$_;#aFAn)sMFf_nYjb5Yz?>@<)qG=N9&=o_-h3y)H?5oMqpV+bMi0 z;2^5xOIBm>f>|A-$|$dk6lJmV7cp2llfwP^F|@OD8ai4XvP}h#VsK%3xT*iQ2Ny}F z1Pq$~3wiiqo|WQY+4XX38!W;b-c+FEB{)#;R;lo&j(ZFI=>exE7s2X6+dj3S6ogcH z=M{8)uk{MGHpRsZH%N=~bBw)pD1zsYN)TnDDoidMEB+|6%0HC6CJ+}eJ>R4nB-5#~ znReX>;{qPJHoHq#=#5OSoU<-BLMlyHdvOuf{R;$TxU=!Z!cj#0>p$;#7AvE6i*)t3 zma;2IdLQdg?fbC|$hEpUuODmap88!yk-GBp_iTSluC4c!5l$)a$^KZux=_xL4*uC@ zQd=_0zb_$6ivG^k@2)_K{}`#340~&m1_*wuv-wV&=x5^o zQ8UXU$#mI#6o(n~IjamKc|VN{rtM1}y=FamwUv zsGbMBPYggw-Nq{Bd>nuDDZrqg1z5LZB{Yx%+_>@?A@x@k@O;WWM>7Wo$oNd>icws( zA$+{cW17t1*VNnHQEY#oMYynn>YFb&k#$h z_WE%9_JT->mo@cc@;S}(>Dzw(iuH;_Zq!x9L zwcbvL1BT)v&5N>Xs7)ooc6#>M->K_zSu(yKE|D!1G6Hcc#ee;Wp_pjpz^&DY5ezw? zxLd*$pgGBi+Z(zgy+m;+hO~wLR<_U=BwSNlwA;MT_t6DQ`TN|0kS&Ch<6e+^21wiD z)>b%y0(P{nAk6tHic{OJrM6;fpa`+FH3|VA%p8!bgiu^tKwzY4_s)SFUZN9IM2vFM zuAtu1Eq>owk|yoOBJ4O7YITK5s2HhWpZz{i^kv6svzD7<6t5MM$ zRJIV1PDlkSfY>uNHOCsb1HH$r6%GDEl<>2fp1;?YVDr2xVYgSHZqVe`l5c|u*N>-3 zhpy3&=|W+{)jpobCLOC*TXb3Qs2D?Fxb*YYnSTs!t37vH8ZBhqXVErDEwpW*=luCI z{w#H&{Gb0TqhdkBddsQ$%742EeU@4#Wf?#fl4K=Ej4g>->Q>)gA{xm{_R!(a@@R5; zp<4pw&**Y`<)Jum?9o)u~XghCeB1s%!Gi3S8RN5k~)fzK<=4@za87q#qlDe)Pg z%!E#fH)%tKiweuPH$N#0Qb^q6!Y1M0=P?2p{){EWRpu@8d`XMR{OK|pftjY`zxUaI zuDC}So~-e#Wx{{oXWUul46|pgUZRpYZzTel2;%p%KMTBMHw^rm9e&$CI1y^AU#yghMPLzd-U+YH5&;7k zy(oaU!ht$_rsk(rUUW#V>^v{dY9GwXEN(ji_}f3C{k-B72j}5=^9o-zsnm{^hT>abA@s8XqoOL!)9w_Y+RI zZCN3&Dv4af)IAnVm#qBs&(v>q2$C~k{XH~9WE#1jjYY`YVXRPGT|C*owY7l^9s{JO z_DOEvW)Bdwr<_!0<@>1@=UPI*!u|P`w^HLX1ZxN)&fyY#8~N}56OL5-&yr`YN&co7 zENQm2gn2HAJieVR`x!;p^ZuM6bHW;K!}`~MK+J6hm}2@ZjtGELygaKZj?Ajz-{~QN zP!1qqwJdx8h6bJy&3(R9eaR&w$TFU9x|R#NG&TKx-ZEtEv8GpMddk+Ys0I4Ub^9MR zwQ*OC1jIjs4`8Az23+#o!m^ExmfBt+!kjFWfT@v-cA1b_3tCI{-)|8)2pUWU6&nn> zU0*W(7^dmxKEr*SfwIr7a9Dxsyg^?IHFf`9^n%0mxAwaQBQ!K*xg7a7+!WDk&RBa+CbC6s>9&686W>fpR_eDZD(_S-UwIe%G#Rn?_FtGNOW=T8P@F| z9@sNy0CFqDmeZ?IBZI01UAMI2)`F=pw#W@XpO3e3RoI^2H^Kf#aG+2ovAuN1ou$TZ zww+pr2^547ZPqlxH6a%^9}L~&V&;7Rz`T!*f}PUlFm}ZU6ZTuR05}&#D?fOzg;iHX@RO7Q2%cV1aXGAULw3$h zqEg>uBkCd3PPSd^Hcuz;XQ!lPU4gJWs8@oep<-0B>R&IWEq(uD&;hhe z`&?KBZ`FdRtRduA(Y%zO0hmhSGE+&f%cx*VOLA$F+< zD3+3Pz2vm_Z(!lK5Ia(UEGtOwsQPt>sYS|w2~wt6-(!|t_cOQlaqmBA@O(*5g}g=X znbMV0^r`{?uI0(~!H5=LoF!`f*|ZZ7Au&V>8Y;XcP~Q%=44$icD?LH>q3NMO|5{(! zDECH2XW3pQ^H|!X_Q2$ue{+d>VV2g8t8&0+vsFzTn7=p6upoCdWGx%~1KN?ub@;3+Z-a==>awsbjXWvTn~5Mq6`vQlvfJGw$#Dwg@+bN+7S2YNyyD~Wi==*6Q zI;vremptVMdZ-Av(lYU68bwsE0+145XH$WNUoW$2W|CU8{b{Xt!1c%xTP%-X{wCpE zCCa7aSfHxLhxz~W^#3XseNXcsRPR?2G^*Id!bHqE5}ZIfGEtZ%Ucl@-_#Ja$o;#Qm zn9sfenFYo|3+8sBsRag;h?as11A@W%yk<{wp0m&1d#$d2-77MTS!T~NXHx1#Gli;P z^pdqhvjk$bTW_)o1INH&$7lCfwUaNX6y9Slbuiz;MBax=7k;bkc}}B}S_FN9ETMrn zKRbE1U?)uDS^nLH7<||zKr2A=t_&6YIuw(H1>-sCtrHk-o@>~Cc2J6`2 zzUjrcp0xSOr&+{E=0kbEuzTuSs7A;dm>S-Khk^==MKLUmPDZ#ZCi-3MDgSn(zn_MX zOOoe_vRc~vsb{aYjwErH`y(*q%wc-L&Niw#aF*P&$!^uZ{TCMyb(!@0htWMjMdO~n zsv$$&;&y-<&_LVdoTXsGy_Oa!8ljfy4NwN6tytvtmw2wynMBWKE8Jc_w_i(+FgS>d_aSU)UEoXR!P1H;(BPaTodhy}dlVEfW+8mfwDbAN zIafK$##AnqA6z20mwh3S*dHRA;-XSFWfDQ8K?nwtm5bumQ0bBX|Mp73&yc238Bzd7 z^#d>+ZW{NA!)*6bhWTSi9Mb1)1IVv%|NCaE#d*sqWD0=*AZg8Z!uc2>MzucL-}hNt zTtVxDAd#$zfFP(nt4-t9FHPKU{@yZrBHiZH6lgIpLiR-s5!9SLZ_z;Ss_t3$GG_n_ z&v?PYwDrS;iRNryAZIOFsJY$WPdq|9p`Wj6enq;y!Y$VP%wK)W=Yp%aBKfDDa0G-k zglQp?8IDjR=&vPj4(ej-%of7B3R2tS&&bgBiV@bWNfF;JeEmRkII&VmK}#-v?~8IW z?)fa2TZonwC+SrDc&SBj{?tRkjAH)Ve@unb(6DjKmxlo3e}BQ2jgcH;88PtT0^sz9 zWh&cS)2 z{j!{5R<*VAu(>j#Od}QeIPc#8;^i|zV<;Oz{{*<-{`8Y6exi8#HXV}7%z@JhU?PN^ zt{Xg?p;E5x0pQ|B33|)iw6jN6nGhxxgMKAl`#HTp1jm)K}5;R>V|r6@)8V3Md;DWh(k5P|ldBw+WiUE>Z%S~fd&JSg_=Ztz7*KaNlv}Y=|m@VNp|6LVN=(a#2 z;Du_LXc-C^`mqZ?Bpbb7#6lb~3N`k}usG!k41i^W5N-n|7HF<(%VfWbruF7bZ(A7f zT!iifkkOvra$g1Ho}WhyYz{X-A{rqH-WKn93GU~jmW;MM53<5)8>zv$g5sQklGJqi ziwb^0sXfN@N(hlX_cL(6=Ym4PCip9)INPtH?6Lkn_Zey6V~C#r+5a8qmTyygKSSz0 z23T)9E|g|VL%A{y?Y8x=N~mIWG*{_!Ow4sJ_H}L8*>XixsZM}XVG%4+w6K26i)3b< zW6HzoP%;TC(<+PJ)d`Q8kL1BO)i*QKqbipYGKgNwu4z;oVm|491Q->33pf4lBDeNh z#k~onKn!ng&E+A9&5>uZ8;r?5+Ij8HF5-=|>_#fQr2)Lrum6wXn$?>ha!az`K|hAq zzAUfkMnahbKoMJ;VmXt3UyN-QK}-@5^Tl99Zjo>911WFHYou<%8DhEe(3B?dLOOnz ziVpciJ4e>@>Q$8wo&c~h0#4&J0Z;*+dJSulAHCfiES@!lEBt18JWDK!8XVi`amecZ zEurDVFvAx^=XW!a3qc09nPgEz4x5LIYNgRA_x8)RGhGa()|aw%!1i&Qf_DN!YYL|o z!>50%(6dFB>%NYcWV8F?qCUw_rguzNAX`jm;DUv!KSKp^Wy|^nC*6c#voUm75Yz9e zL0Ra4vM&D?L2ur0E~-TA7WmtNIPqF?B}*n=#ZP%p_V8F4a;Zv|W|;t(=4mePGix>+ zr4vTsLf}(1c%?ATg+dC#4ao3%5l>rLU%IE(azP>rMU68Oa6Rm>7(1Vss3%I#a zoB7x3fbqTrLxHFQpEzLUO<ZmHq_)ICU8-DkJh-`}z zZ?KJ@)YFD)0;l~k$*Lr(vJx3fSNaW5UDK56RgjV;q?<6VW8^n-^@9YNaO%amIQ7;` z-Z`cX3<1RFDqafF;4Qc6U9T(n^C4mmCbRx28Va~Zso|++FXHyRN3-sxIfrdaK&lbD@3W}I4Sj7~8 zU{^r|u$tw!dcr|5*w5uz;4mrTu>YdN2Cq5@NdB{mk*RH!s0yY2>w^Hh;MA`8<@_y6 z|Mnlq5I#R6ez#KGX&~(N^F0YyF}eSkH}a3KFL>5K`Aje4;*|mOY%xwHzjlvgpQTx$ zbId``n!3K<-WD)NyU(B*-I`+8#Tv_rk6j_Lf(c(!Vn>X?UY|d<_m+r9$Ub?M4-eZ{0mnaK{u&T|7bYTQ=js*Zl7QaXP+{dAdxjlgtYAB;e zu>X5H4|pqkE)_A*{>*oz;FW&~2>E%< ze<1Uj^s9o)9qSfqWL(>MCp503v>)1EW>SbfB5xAzjwuE{DAsA;qS4y*4_2Zx1J3iZ2lFN9T0QaA*g-QdC+yA^N z;MSn0V(C5iNgQem9QCkYu$G9Z%YXk-C^|l)a|23Y={>x4J75ywpNP0|as|x;+7t@-6uqa`1ujq{Vp(uf$xn-1@bRJo4yOb;C@ll7 zRX=uGP-F~A`I-Be*|S@zxQB9TXv$ zE;&S^J(+*@o9Fds=>T>0d4=r^&Q|ynXyTAqF$t9^0Sv+yJZI4{CyQ@-i}3ZfV%BgrkL9SEp9+0pIs*9sY75_ zLTXmp+CuV=RF|_UTDIH(rt+(P%lVj-5|Qa&MRb%r>k&d}_5Di{MDiRi71X6PgwgeP z0w*OHFlE$RMT~yxbW*x)5WtcK1b^QBy8ySRAj7?MDZkzj>~TURwdF)eO7}BUD`J)B zlH{#lkovD()(SL#8ZMWFwF=mqJa7FoVuJ>f=7LP!^DU6PIJbd+;x_!rnoPTL)vpvN z)P?9k$nEpjGKYO1codhI{15&s2Qd5p@V{%gL2uPS|KbE|tKy&sD@!THF#Ri8@psG= z4iTzxs{6;$#IF7*v<)TMt}U0qfvj(?)ufiVOwT*bZG3ndVWldieT#Q+OR4FGg6f4B3( zkB1R1f7bRcl~(pWpt8ejBJg z*XzSj77AOx{R#VRp#D$YtSiA5;R9();CbO@=eV@$zo)XP%?-++ms2HnIkPZOcCoTM zXW_qYO&Z9-WUAj7zwP-b7}{86_U@WW<=GWed%_fV{l$)D7+?s}KG#Ye@zb15G0Fb# z(FplisHda-E{Z;YZ%v9Fo5f** zTU@MXT3WgwiKE5%=b-;|zybKmR&Ac715jUttK~gIt?SopDT^L+WK$LQMUYYn2OfkEGqs}+1iR}iKla4uObh1K6QQjvJM$iIUuy$ER4(-y6)}5|{#5kp>wG!}n zK_Vps{XrsewTJjLR=GPP>`Bq%A}XMz`-aDTT#mB8`46Zh=o|v33B=A>F9>{)Q&Fb1 zy)G9O4U5H`oFXj$r2bC!FbOyQvbI|K@?|N>TNXuTGr(yiw^jdoSCD9-PKj0am!qbgR{k{f=XD@r-LUleLWG>8<2mHK5Di2}VhBKUvBR>aYJ z?(7e0y)=bzg7hPN%lKjm-Fx;ZAN%gs&&%sb3F>!qc<16lBhT&@Q#A{%dbuaBcc=I{ zL*PL$g@(o0Xx^N3Q6Anw`LEk)We*epsn1~#ix_yz^E5n{Jy8J2Bf zb3~_Kg;gLe!lgcy8iwZPHO@Jyi$+&m|C-bSf0hvxtg2FJ*U>#Mwd3r5?T`=)a_}`a zOUoEPsHTxL=7mZg=y{I8R*&|IiZ>KHAYIn@i(3 zdewKoW|aZp-O@~0BZzzWE>V^1B^P2&w*rG~d7&e++Nd+GKeS+t$s>~~rUm>y`v6Tq zvcF7V76>xry<@GM9p|<@wHQ#`0MlnZ#Aa5@TmuFnP<87*nOzWsDQ3!cAbg20X7-aZ zJF3^e2KCj0yii-S|6~7|i}LIiz!=o-vURA(g}4@F5yQURMbG9qDT>=OD>Mva3kmOV zJ2_5VY);i5lxgu?-*l)ncYa6LE$n2f;l*?q7tVfxGoJ@LHEZ|Ngjvg|qL;<)kWy)m z$l?@gYDX@K*@{O37K-W(O&m#>}Rm&84!mYkY z>;V;B90E!uM^X*PVoAmW2OH0^wIJ@i`%kgC1=B_wS|x1?{PDmaO%I(+_N80;BpCXQ zlWYBE$sBCWf)&L@ut}m3FM$NftwGbk2A8a^D6M4n0J9*#IeeA+ln;5d3l0;{8HOq81l$d8(`iT-u3Y)})DozsYYp zf*>JXC!VzQI%zQe7DzK-2VWAuinzq5{*cZVb|dYG@Hp zwcd#I>TiLe;H`P`EaOMd%S086Sx2;no_*Y3eFSJ3=uMwQD&AIEf2@5gg*&;Y9UV`} zl2%(P{kJh7=nyfZfIvaG8{Atuff83TjInxt;Y}0QOI~$_z)?YV$9kyYuDn{KdwF;m z!xKsG0dScEr8=T|bB3&LrFWI!V>!S;0Ij-r0dwN4^{Eiv{HR2ktNjQSZ!00qrER5X zs<&V2C<+qBwu#_17Bf}UC4em-{k==@@R2}?F=jeSiZtyK_OO^|876#avk|a%-$YI+ z;Lky;h(4+XN;rkjSP=506sx}?8&mfbTM}QG36ua=~2h3I$gVIL{RqOulWxEIf!^NlR z=IiRiv9YyPNGt!cxz{II%8|C?b;@I^wK2}7O2UpyA>Cs*v+hz3^tgzbgBf_A#Cfn> zyizR=$vB}}^~I8xt;U}@_r==01E!ndmJC3woTf)L2{uWr4@G(pubCb33THj{#Xk&m zaG+z9sKr(c7Py@jQk-k!7)hE3yPOjS<+Kq~7%LB=s&o}qN$l#tHG8KpKKdoC+iE3D zJ*Rl13lZVzAOIK~v72euPPg5NuJD_YZo@KPLSQg61}teM|=NS!@JwXgpO)U z`^$ANu`?#X@}pKc$qRvn>Hq+?6Wh31qcqZQu(gmvU)D*6th^?0i6Fx_fa4>adg5!y8-*ALOlfpaAv7P(*x%agUJw?D_VsK>hdgJ?6aSnhv*Y zoH^e%Bu@e1R%+eYblxW>P7K#J7TO-^HmqOH+-G}T2w^i6sB!X`3AY4b@ta6Oqawch zapZ`3@8pcRg%ENWGBs%6RU~AZ{QhO#w%`!;J^)N2vCfT@xV_KqEFq*Fz8@oqxb+rf zgy27ZiUnTP3JC(arwoF`s3vU_R6-`eR?=QF|D%7cjL5Ds%(Q7TjAF1CSOQ3|2%j5G zTUs*6Zz2a!Z_jfHpBEkNxk}2woEjS9{dfh$QZPJgiEl?MOed&JW2x;cuqP10Jr1-i zUXG3zDJ^P*R5+?P&qmT}8n*V#^G5Dbum^F`zj^P%ud0e`nb1AJRk?z#d(66ikkCKd zo;}4u4F%@y?JXdHwhW&^1?eS*8Z_t3Nq8xEk7*{KTOlzx@U%m)zgpL^^_w?xMsorV z_ZbM;>MVp@oE05EiA$@2G=8!WWtUfsLVz^r)SUagC4L3ebP#ftDF5o^7okYds&wj2 zij{s{kwmIka;#*Qa1585z#z;W-Nh)oVe^kkH4BCM&93RZ1edzD+)0xHlnq!KO)-D+ z%gIKY^PLGQRldb8I4@%8cZPRmnIxLDA=i0#SiPftTWDZ1wLlN9IK=Kf7VoH#D69-1 zDacutdW?Pea43^NDsXNsNp2V<9He!~U`!RX7M>Z$)}D~NwC6f)CHaH*8RG)_CLQNj zKM_^#6pmfnS)56ddMvINuXb{Xe=n1sZp}1&dD~(^I9wqN7%wG(9<y|6ocVl@lf{|p zTewXQi+>M*)GZF6>-&N-+_v1`5>U3!|LT7kGfT8-8F8TvTj$Di5Q6uN_L&lRg{(rC zz}rhITT_e5NLv*Q2R?B{b}?Du;#Kmy|2 zRG@JCo}q%<1hJ`YYsD*qx1qogT>6T-3S{~izu9urLjvHrN`~BIP2?q)ex1gQ+y))F|ajI7FyT2Uti3rAgs-=71NC z`kdHQIGyv#@>%KjrqVAf7H?7G^X6}-XgB}uKLwctu1$TuzcFVOuJYggzi}g(L1c*f zotp#co*}2S+^!oiwVIx;1k9>^D>=7htPIMaINZ-4SzI8`rkry4dtSH=qLW*+U~-xX zt@>-_KEvk798ob?oRncL#Im{Pex^pXiuc5+dt!>&bISzw_c6L@Mu4M5!IZAb*igm4 zUe(Gnx39_w6|;XyNEZ^YGN*10Ye1%q?L?z84c3heO{XH8NYEoLh;wBE2w6KKOu4e{ z@E0{gYV|&0h%oa6=CY^4>CnOy=W{Ejj6DgVLCQhJIZpOj!F8Vmzf)x(x{dU=r(l7% zM-0(d{lEUNQKK9)Z-e2jP=;3Imfabse!aY8?sEZYs4JqUa0@aGy?U(gSsuLXZ!vz% zC0suaquVsMQhtv{bqm{AAk+wj+lF8UvckCPRXEiPI>MG)?@5gOgR%Ptm5|u8R!y&B z{(Egg?>%riXv9$9gZG4?nVdC{{+>DWxe5{e5>ytqgAH1?mIUVaV+m~`ur!j><~>Vj zE$&(L_XL*0MH+l-3+`o@ZOB;3elWG%hHF8{DcvZM7F#{quo2u7MR2rCegD2KgWGZ2 zkg}*Dn@$3nOxfbPw-S`RecSn0h&Zd@Ir?jSe;gJX{@==7KVruQn(=A@AT$)loyRvg zAjrqcH8ERtTLdq)kk}$1z|eV1+EkJ2pPa@0wTbaa!o?W42j_GUvf;E%4sbJtArsA(IM$-RzL{>PnGd z0fivaT_EwHL^6uoH5SGwH@+lLF+&U|`~3Kds-`4Xh$16E%1-sTkYe|Zeq*Y&yQ!c2 z*6XE#l8cp2L({8EtjCvfYNH|1&n^~?u@6Ke)`Ce%dM4^`Uk%CIEj+{8w!pjzq3kQw zPE7{IQ|GQ5i<;Z(Lxu*jJHdsP$Vv?>e2!PHc-m%shIs}-TUv$PP{VA) z&JvOKy=C6^zp~y`;ZeLa!J3qv)-2$WR;7@4tR)MAX>i3LG1PnVYYlB6NUigQ#fFjW zO#pkdh0vm?K`q-;wPDr0MV)~d2F(GgdMdT;HUBZbQce^j&)&?X+KI$<#1F;Q_fo(? zQ|rIPF~JTp$JvlcD&Z@Uf4hZ|+w>lQ{5-`1SZ*dP?4?^+(4U2T))eC@cKOuZW>-ld z;hEUz)hUduw#rRPMrlj#w_-V}Ne-7zb_j+UY_1(DHW?G)iygGiGnHx(<|mC3*1_`) z%G`NZ>qV}52#Bh?RqqBygF6p)Lk2)EdX)KbmMqsfIN1-AgJy8%0a0S@QoGD{w6@wx zdsBeJ`K&Nd>^$>D#)XQcOWuz)$Kg&=c_OG4*p`c&G zix-m0jZ5ywWjQJC&V9jYg1m&_DKR`)w%5E|pq$#?+ydnOGG;=DMIi0mI`Qh;|2oQ% zH%=Nzllv(b2^?wg*hObRp19{Ij+f7+?X4RLl}_ppxEw}XQ1v30EUiAMY^8@FP@Wjv zK+%IuxP`3uE3R?5OBnC0(y0%B7cbc3LlM|z_@su*@pV2OzBky?pPY#6L=dHNu3kif zs&~N31bqg<+2dd*api4JUR#&i-+aVXA+4!WDc|mawoU74`9^n2*SqL!38;jKc}*TD z3cBL^S~ucD~_ z>e4@Jjrxtj<5iA~Rv&&CU(J#_PmK2V1LK_mrUWCd`OC|ci6+N5L7~9W#OOz`{JR`n74 z+mVH96#t&C98U=D4n@-0CE5#$e0v?3`384Q@0nF#+N=5*NHwO3lYje9>yoVb`5|QJ zJ#PhlD*z++TQY%tE|~OIK+E(#jz76&pa=87%7|2bv+w<xM0HWc{cr<1p9Wa zW~;Zh@0^H8ZZz$+Ky1zc}yF5+iL?3ox!INM*|b zfx2Mwyj7#1Mj+zSxEQmpAi>nsw-tIGtlGoSkAJPO0s=&shYJ~8X|+3-L z)UP2c&oF>~-EQY@k%;Wh+X15>pidkT;=A9Ug>x>6sp2~GeH(ZWtbp87{7+Lzy%GYT z0W6Ui1=Sy`-ympXH>Pv;hOax#`9Hy`c!M|7Lvx5OX+ra-UVU zws_m}3T0}FqnLq+k^MJOXP8=L$=WWdlB@fLXVc}pXthoNrKvci@2U%f)|XKHB<-rPy-!}Ba=|#9| z2|W4{wccv74vE2G`N9c2=lN^kB}NgH4f0p_*}YHF7H@#L;LxPzsX|d4dCS6|K4oys z?SQ*Rxeny7hKpR_*B_l5m_iVgdac^xj&4E@ zl(+%II)DQ5Mk0Xygk4hu1BOw3>HkF*+auBz7fHC?10 z5%8wO=Sgz<)1lNQ`Y$IB_M_9B>z{7)K1&OEvGp zrCUFlmF22+;Wlx@pHdoQ)*;dN!zf!yHN{JEv5I}% zOO8qeXrS@cxsQ8k#$M*TC+%ni~5M zE9`F0b2q>jQ@G?-FU$tBTvQJ5PkOItlCDCa@ zO=Z&2Fz-cv?o_hRa!Xt`VvJ7i(v-&^hX{RG)%I3GS zuM>s-LHlVkp0Kc|SRpTSf->RalOX}|Zn;8iz-6OB`6W-{z8hW)vMRW+zw~}V&+7yhvAm!jwgGty z3w*TC0>lr8B%dg2D5&*n+%Z#y#$bt`5-wCH`w65lurY;MHWt|1T9S9w%h=(ypa861 zAI=5Z&uOr?7(`R!W?$s@=CTkG_(ww%vvgziDI2kox-adNgg z&S00=C44#hnUKrKyx5S|$+vDsps?_qKNHAh=`jF{-=wyGl=7glQ|v85j`C(JPek>d z+Rsul-x$5HUs5J8@~#%v`Pxdr(*5&FgU&l!s(Qp6vqy@ZH{1sg7ZCD6 zfy4djkU7D=EY^0ehI;Vs6*lDV@)*CZ>{Gz$^pgcQTp1voWYJU_>hC&1fEylVk(tX1 zXHeR*7N2=30Q<{9*6qOElhlT5s=6x;K^3st@^K2=^XKW?b8JzRi2UrWc){)|E=ucQejODwS05OUrEw(sZL2?7a7;5L&WXNtMSHUMU7B`5y8?4^~O zUIDe)iaQ9;BKYT22wLG3j55sHJxOK3;Bm}lo702B530uDZ2-R!8^^0%28tL!Mmas_?qiMidC`zJ zzlv*90MWRqtu<8U`NHkmNH*EK*pL0wArBUtA|fon%xBLvE68@`{=_Dd)_WG@Qa`m3N?noVDFhro_jtAP;TM!{ zDchOH=?rAJ`cfH1Rz=sU1ZmT7#iS*KuB%_Xx#o9HsGkW}z5O0+k3n>>*h(#fUPP#+ zX1^~}Sw`Ytqk(L>Cm>;RN&gT3%lr8@vCsbSQ_TthYTHq^7}Ligq%WYD5#jnR)$@Ed zc{0kjse5!&{MVA0#DU3sq$QAda+iTF`-;0Q^l9Rl0Q@CX1vA=l%V~|7I?iI1tS2NV zKgXIjW-rd4&B9i!G(6`xeiev5DIhkn7>2MDbTh5+f$w^E97?WI|2+W%)R08s%j$&) z2iJyk3?=b`or0Dl`91^0C>6g$9JYf_1#nIYzy$&0y`s0OZ5VlLxlE`U8Edunh5fl( zP%0pvdR>m}sHN-=fCou3zm}@W=O()qpzqz$QTOR2jf)KrdQ--NeD$p7qUscK>L?f6 z_b!qt%HU2FQrX^@e6$hHD(l5?7B^=BEo&Vdd-W(=e0URSVaD+kwlM|&UdsdPtC@nZV3$`mzf>sSxm7` zudFv7eFjL?y;1d}2dORW9VSH{DyZd>{L7PKhy-fRx(FA--QF?O*yk^}UHQJn<9P&D zuF4G8INnjPe?BBRoiP3J)X8_%XiFro*xd3%p-t)^W zydqJ%>Ms>JpE1t2w`^(7@^==YqF7rla|`b?B+IE=8HG^$7w`aR8M1u59rrZ99ZXk1 zyRVq7S}snE`P7$8&gFL6z;tf^wtoS05F@C#;Pw{GeXh$V9O2A%-m?{gE^rxHJ`=M5 zFyK&&A1#2QM4MWGP<}Tl+`@F6{$Uz9tjg$ie*W0{^H&kK=H#E;!7}HjSi62URbQ$m z^uc2P)(B{cD+`!su3@&3&(ycxgP+7LllaBSv}OnIiQd{yY6PHiYkQm4tpu!XuiQIlMgP96ez&&e z>HBCD0z;W;FLQn>3CL`bu?!RG1-C*O{c6tJ3OM=x{Zzu#>fG8v$1;9%BaN_DmomC8 z^|qNZtutA7?@G7k^A><%(Nv}v?LdojA?DZpERKQ#oU)cMAz3k|L1c?t^O+x1IqQxA z2;u$Pv{EzltDdP4)EY%C*)!_1@rYx9S+ z3Si1`bv6no_3{n*fAmuAijbu4s+i*k%ZslD;6sDhe^xSUad?R~Y^UDx0Iq$3mt5i{ZE zeanzG?&K?Ndz+-0c=m|{iW>sDCPkHtc!Ef{qunQfQIptN0nWex#O`3vFGAIv>hy@a9ujp;4(n->Hy_qv zKUzhA4aFxO1><{=X#WU)!&1-tVcqiAfh)hXVj}x-W=pA`yufps(ebZ^-uKEu{G!r< z7Z6}@W~?!YI@;j#A0b&yM|r}ybl$Gy7V;*q2Oiv>n|zW=@devP(WAS&gYUfZ6G<%A9GnI@Iw;5m3%TQ8A|B$7E%{Sn^045t$y>|0Ykvb49YlP zbS#Fk+TO?R%`}xw>pec}k@_VO1*w41Yy~GyMTQk5XX-r+v`!HL-B|Al@Q>br0O;FG za>(SQ{to!-2Hg;HaF$4P|1Nr5^!C)MR?_O97%CABt=|`&IP~)soz~yqo|isrh6M}v zV9AWgkU`@LVS0gp;T(*gbq4_Lp3nZ05_qekg?`T5FM=>AgQNN(%W%tdm+5{VBy)Dj zttd#}&Zcw^hh-RfrdHNI-(og2Nd%<pm7j+SG4Ra-IQ9Vk8r^SXcM!6|WVt?k8T9j1f_5f<21m zRA|CA6`vvd3Uf?ML3|5a5nv+xb(HBgw*szAkfv5fO%I4)>upl@7F`Ho%J-9Xg`sg0 zM+~XqLODa%f*m6+=v4#Q8eu;J^uP_*4pF9RYXWTHpAZwKM4Lc4fwVP0=jBJ%0CB>) zPb|HSdxp=O-ny+@x^319AqUd?xPIdMkis?{Mf-dqq_(cwrf?Pdv3|v!mhMRq_{pH; z5!`f0)6d_T&nrOF>tEp~^nQqufigO)DU(SJG2a0dYe`dC{p%Af>ylgd1cwGaS=tXp zFA?OwQRW{|+;@hr4nNbx!J=I*TF{j7+PnO05h+_RIogJmsYV=DR1H<0IrJWxR|UFA z{qFZ~H6U9c76$<<50PY67FwLjfEF&6S6E3yjLhRgA$6PbihrV!8yZ}F?q(HSu3qI9 zq^(BE=O;ssz|T_OtG``%V4jWVkl7&5`RgGL5`V}-X1AVF+xVS~qykv$Rk3xk!jfQa zkr$v)6r83AYmAU7suEQ=-y#Bxxu#D>LOCEfHl^cH1>RRbgu&cYL0Swqn-%l7<_j{@ zadptdDO5g|O>f9$1UkABSir-LiecR~=7lZkmrdT^BG#wk8y(YvWko_&$f9}0dI6pr z?32t$r}iM=No^!xb9{d=9>Q5N!6H!#X|4uw*5|KsCP>N7hXF@wRqsW0r{NnvKyHU@ z&k?{J(vYgQoKDWNNK%^(UqOQ`EAz62#|Ga{6^iLz z`L(R#Nm6Z&p+@srY zwwA5TxDHVM?KHE5hd>y+wp`EP_jjm~L;Tx+l0rx>z*+?eP(xrV`;3VDN7-h)&2sj0R~n!VKVGF^v!cx&P-|KUU?^<7#niu}n)^>@+b9Y^*NQ85=FE9} zIz8l`S^boM#d7WWKH+;XpjJZ4FaW{|mkiJwA@Y9a{qVPgG9eeTK;S+$L@XP-MNTE> z6;dI;RCy^2>Ms$kv@Rfmi7jeU#_BAc1{}t!MaVr%4GveE)$=Vtp6+jx3pZ{2KtB?o zma@3YRtWS?bCqTDiw+gMYKz|UBja-1j${C`i@$OsMw6;npoQ{6*wk;j5|!=!-Rlw3 z?~{9`Knu$DG_L4lyq~+@w*m%^8~X7b6vAol^Vdi!usC&VTGaKk9E#CBb>;U#6mXsa zwRKtR^R2ckjI;n06ue~wXRuO0(*Trh1ns!x+ab+qdjAR7!#&4KG;!M&f-Pv$7y*suE|*CWbO_BpE5OgIMA6hf)6-eF^?qK$0u-zW0KcA1 zejoVP8+ZvP;ypFBjTh}aZ{KzqnhH>>FOP}3Xsn_s1Rz%?!_@o+46Gmj|`fX4Ja~pQ!ZEjiMU+3AC*6PG(YUMecQ1!N6 zBIgPS`fUg?_t-@PTvFy%82rF&Ts0&xBDiS#!Ic&%U|leI*4R((S#5fux8dNrh_F!*ltlv+|D7o!(aM%WaL+4Hh!zP7sQD3P5L>@y!MH2{wfi>? znfD_MQ80sHYq^9{UPYOXc?Mj6Cy^ln;7Hzf-Oq+|mLbWvrj&LP_;Z!8-!d69@H05K z`BqSi&mV13Tr!ONQgzR^5jf>aPx1ZSxD9S$%V&Q9fRs&62_VvcB>P*uf(ZX^55Ip) zkb|=X7Nsof5>b_Z{A2;6Z^sb4e?x!XUZ&}MA1isr`@C!dOQ-hlE_!+;M3)d5UPYuesJk0$Qjdp zPn>vTySxG-pEW$2JH$SI3#ON_7x7HSdIQRyChsg)U2%GF^QIU*iop>{`AuCqVAb}cp;JV0 zm-|ipVBRpS97`R1dJpjasPgCDXH~n8`QWr2&<}x^L>TWyTq1Wiag7&)fQbS!!Nb+C z5P^@+5?Q!EP&<$5BKKCbVK5-|9rM?uLkR5~%tnt$(}zKzlO};(FBVUi^W0(UZ^=m& z26IQVDX0Mn%}-^oXUSLkq@afwH7fn2YN1gi5fG}KW8cHtV-E5U4e<`Pc6JZ2LS?)u zRf=>t|Ms7#9JsA8fF=~Oy{wp6e6CLd0H)iarNNovB0j6FJ&6lR!Gqq4wvuokE%c(a zL}AYS2nj{}4ksA7b~`=FKN5)n0>4n3`wC)|sT#rn%b;G?fHwShSfM(O6Varuj3QNNZUa1=jt69~htv%IR_6=cbIa1Tl{{!aQ%KZ*$8hm)J^z zO57mQ?-_yvBCIG$L>bBubqfKc)h#zn-uw*C8VOT{Z`0_d<$yY~K#w>PwKavzN0U7- zRU9JWKK!a(OWe}xXD#SHF#0VTgz3La7{Ul;U=D^AU2&gF1Z85xvg+GQ$m0D;NOGqp zBACqM^XX`)DGD&!!+c^Z{AWZHK>yRjPP?X3aoMb(rxMWH6#iHWnCaVE*53o0c|4M( z(Nvh!idt}SUYMQ!>vDXE>0eQXZg0=Yo=gMNx0y^;5^?hRo{@kN6x1aMn`LbO$Y*{% ziZ$iiTA=1Z5L56kL|k~)+&L*bC~1ky{~u5PuWI4fJP$(kzK(&QP=c5Vl^7V=AR-8Z zof*#Z4xT%h?G?&8yS72X4r-DeOf-r@K;-6rDzLOr#VLxX-YjsywRd+`@ zYo9I7RJrD-Jf;29m+mu2L0iloYvRT}Up96G0WNfma6uAInXZHYSLCwHSA`|d^Rba4 zwgbVjOdv#1!;-hJg4Te6xT!J!^*{aa{c+EYtl)Ds4J{D@_?>FXK=4ySCC~A1R;g=L z$PnkNvQxcp|C)ZDVUSSPW%)^_7VYiS-YVLaeK;<o46gb>m}1y*?#{J$cI&D{d`}RV8#jWBit^f?h?XC9&NX|utW?eLbX=_<969#^aJ-+M|@9#T< zdY&(-IPPt1O~<`5E#AYoWX0(urElL82CnE887)ovw-`4l@r!>-gC9BaxiGondbQWOkLZv_~w;?3jeAQW{1 zoUgdR4xd)Jx*6e*w}p)6;G|9&lojiKtwro;Ri@Hcl`28>O7y%5eWv5qOj6<^elcfO`~M zdi(O=QA4Qm+g0CMEfS$J-1DVR6n%I3p}h%eLEh4R3czq5JO>O9`j8p;O*SShEt0L( z+Z;3Rwj@~L27-y6tNZNX&8@OLo|T??Qfc~fM+rmLsw z#cB8-`Bc6;H?1nF$6CAMcza6vBX@IydfUtFvU)acA)99|>JQ3m|gF=Ky>Unk9z9y+B>tBcz!6K4GTb z@B9iOy0CCXyeTBCw~5p+fE!z0kpUw2)sHSfgv$MliITTCg_XnYx&jc8+ZuWOaBE#! zdz(6gc_g` z-ZG&3$*^vL3lZD5u#iTuwuyywpO@NEa>M5$>Th!yYV+LmAS^Kacfiyj zhD%>ChP;ro8``?BkG9HGhZt%QyLXr3wjOM~d!uX+SWG@SwZ%z>|PjC<%RHKaXK{n=?OCb=1oV_nEvR&L&!HmVcDF z-U;nK%?A*dC|8H2D#F8JomBmJPpg;XTbN|0^vNB8Dg`rA&5^jqqfwsOEepk0n-xbLyd&7QUWDB9yf$Z>26pF$80q)C zhS<;w<;{WfUaLeL)}vA4WdbaFZSAHQfyl#?BsH6>@C}Q`0lC5Bt_xDy%Q7}Hx405K zS()_g(qqQ`Ufn2*8Wvw8{@@y>3Y(VI(M~*Kftkl9M4k|p=`f1dkZhXcz9F_;T7t2b z02rX9Pl)ag5@#=gx6G|&hP>ql&u_cRqd;aD>tcsl6)y2%VZjY)&u#O7go|_Hfi%$) zrD`0Z+$2a&hg72lffYT*Dtt@Qhc4F6CmVNGxLu~i@EZR{Qr7Il!;zrFD4?&;jr*pM zmp?~!<(qq@&WV&2os7#5tG50Cu*OgSVL(n^GUXZ*8djz3w-G38?H7LT<@*q)I2*5u zK64bwYE&spBOpTobQKIy2UvvLTl2OSetc|I6?Y~$%iu@WDYZRsg+f~F3vU91$~_xo zz+*Btyk`+WhJhy*XdQB?3 zu?Nj<`S`0F+WGl3YtnTmt&EElk5`{^+l&0hPtIM?^#q5|r`H0CDzHaPG+9`u>8*Y< zu!AE1a|HF8go*$xVbQI5EGBTb zp()(Os#ysU=^j)o0T>h$pItl8>()=X46qL_d1Se?Ss|{LIhn=fRAsgRjhX0JP+epC zN!K*X8qeWMyMMy02ld?006$V5Wa-rC4QP|IjTtB0nx!|4R6 zISx^?rRWyQt7#zI!hJZz{a9B7@gI!@KUHpamk(ny9;+W9Yxsa_U zzh_DKr>ael+ip(p9%_>_ZvW6_7--O`QQu{Ho6B3EH0j)`dmN)eeOvh+lT4e7-l)71 zhY2ldU51R*RS$l7hx9xirNb@f@cvYUiY22qY3zYVS)K(=095Uj5VK=#nFt*}_Hjgv zk`VZnW$DE{*^*bwvsw$v`S#S$7sz}Dk%@Wi8W}Wk5)k)n%7jtVVd$Ei)Jqv5jwc{% zb1tY~kyeS7ta`bH?1_fK8BL|F34bK8mDK2;5ACHzyT}dtKDV}9uCbuFsQ<(Nc-f)h zbBkZ)z_7|m!S~_Y$yqZ8LjuYGEJw^O;WoCHS|xt4W{SE@#Zm&F>7g{Mc50J0ncov9 zA<6`4-XjMbio{#JjxhiFuLR>Bt%69?wqn7@?=jCy^mKVhOhXfbhD?`n%-Pz)PdAKy zmB<#7G=Td|N<%}@27h1mt{{inv%07di)T>ZswLjzY??EQDJtx5kYr zzgEdROWEbCKlvQu;q@^2KCn=>4h_Y$RPCNkJ7oWT7U`+Xd8-EH`P&%Hyj>%6Yuk);C4o0&w#!_cFI`-)hPovh zArvc-{jdKc9J%~6cw!y{ak)bOld-vB+S=;koEVd!=E1AA*iv z`B_IGa0Z}j@+^bMTi_PsqHmjmpWXl2e{T_&VvsKT+21~fU#Jmi2&oC!v(J(=l*n8V z=Vge#E^r@j(MH!S=cGqN647~yEv%B_GO0kx{eBfvt2qdN)3;Nj$c|%WjT=Y>RbUruX~6<`wt1Ws!3~gm4i`2 z=W_{Iu01PfpAGUX-?J46WEQe;D+T*ZCzV?P;7)*EC|Fh!!EK{+KX2Uz=wF}0eO5^m zFWU+LnM*%gR|%q!pkZa7jpUHJ{hn>jo_i9b6UIO#y2ns@e73h&HqMQJAi@IYQktH= zEZnmwU7!ZV74{jyFlovahbO_*Tm4VRB@>rPDmK&u)GeTINDH({*)B5K&)cMv#wehP z%BTl2@5c+dLip@-EMcA-?Rihi$;qysKlbf;rWR4Sjj1`V+w#ij>1hKT{TxE6)aQY2 z#Wlrk({$=K>pfvWTPtOoOLEG{^A!PxdJ9xy{1(2HqTOl?f94@&_&N0h!Uz} zbSy)-rOAON_@TEE7&56rUD3U@2+eW?)N(s}a1}t)!s28S@q1V+Y|7epeNIYCcz+B4 z!deH?55gYLc~yYTOw z#HrC%+W2{UH0|?7a#<$0;IgPi3`$o3p|x`e6NvPlpuP|&JI8Kw_u*B|dMe=dL{;P8MQW7qI<>yER$6$dhLb%4;MmQGH}nrl zg5#>smDCu3x0zF6<#FQ00# z!5_=Or4>?0ns)$H{ahnt#yXu;0G5~Qy;Y=)wk%w-ia&O61nMY^MY(<62Tz0tKNozO z^pCrq?eZ2y?5IWE9QZW@^Zp6^U=s=B8U&$5v1XQHqF|X#bgz%5_T=?o#AJ6E^eF&d za51MzL9Scp_B5f|FffWctr|R;MM!5sc)=Oq)(edcLEWaY7s9SW+n?obcV|BDvdmiC zJy#KUGghzS@EG@U3zT(1<7`VP!Qx)*->SZ6w`xa*k5hEkk&vWP`T$z-5qHoBI^;zh z-zn-xY#~|hE8tI;_b&j2(}L+Gy5HRLkn%Av%t9UjOF*>01YAsUvol^5sc$(sVB{9d zh3Y85TXHx6aHG4cxv4(9_75(T>K;S>YY@NKcw!Dd$hw+q7Z(3Uka&ESvH##ITD>B% zYp`i{U>K@O-A=0rvg>&##4WU%!uBzvaDob0n-{zqA#H8;6$>ngcpP6BZkZs!=h zQ?(-%5_&v8a|50ds<39{8o1n?yNu9cG>CAYp*j48CEp)phE2 zRd0wz(EZ}0wKiaY$ms3`{hk=h5`LDUvWj&>lv8&|__}pNhbw;JdSeAHXJ3?JdfyS4 zVS|(2frC~O=fR*lxS*L#8~hoHwKC%`fXLl|?Nom#wHpyVmEKzPo|{Mtw&pcFB5|TP zN=Sc5F*IU4)^x|fV9%T6Q!AuesBS}QlqE}DIJN_#I&_^c;-#-5zasO-Z<0#5pQ#Ic zY~-st?z?|0^#R&u%*^%WXv}i~&>Q0K5@bPX?g0p#>R_W6H^#~eM~rv3OSwdWRnIR( zy$V<<$B2McC1PGBFpyyPb9KSaQmlr!A_6PhFh=JH6>vH!pq$}O;1}h*YMieJPOxn3 za|+KxauEG*R>es9_~`FH0N&Yt>f5b1hriGW^_{N^xXJ7a{SRPIY|Ax@XS-uI<6+ z4SGQa0Q`9Y6uq64d6k+j!y&fL2Zzf@0}9*Aea4-6yp=h?%I9*={q?q1Zk?O@;F5LACC#t@1^x1D%@ikM)E9R02BkFUJ(JYN)7O= z)~uM!RA5=DfzIeHQ3h`-BWwl!?3C_LE0H55rW->Q5-W%-4iYJvo)S#Bk}z{(mV?TW zV8HY;2HLlZGVMazd>d$dV zz<~%YIP5_MGQ#vNFyZ^uYJ_O5vQvOA4|bnTVQXcKC3RWAsrAu%V{a;ok@epT41i>3>S;$N%D*D{M(BXm^^N?ZRIXhj1S$`~ zYACHQzTn#K~Kpk z3r+I?l5Ji9i7vLYgtlVib?@f)_Gu1} z_cJ-DltP+Q{cihK1?I8&Z`{KengszMg?>0sxbl|xqb@khz2d3yIg>3v%`pF2V6zOtW`iC1ePIPTRhrE#exuBurp zCe&oJ$lkv)xamM;kso&{0AdufXUpJw2vnqSQc`6KUHNs3FR_~-S*;=`Dn46o{USmg z-e00wOIH!qozmlZyW^7Ji>2QuYc1HUFKyn0C*__`5(E5v{RB4@-^DURTpqwSv>eet zW{zd*IowIUi2isNm~Fy_7xfUF5{ObCotrx~lN4o$L_$(U3cwSyR0*tjF`}~%w>t=U zQ_2XwsN_#z$;!(LP(UBJ0wMZ53`&M}1bc1ZdRy!9p--bQJqj>HZLPUOWVlBkZbsWP z46Br=3A)N+9Ul<1*ZpG!Slg?MyiQQ@k!=NF5d&||oZSFFO>66H-h+l0bGC2&Ff2)0 z<)fxo%S{3hFS@kliAH>oKnn6H|1`r3qa0`aCuuC@Kj4K)Z~oBdDsF{a2_Gc8^VZ*p z>IHMQ)_kh(+Ih$&mI4`6;GEV_%YZ0qgV z$*94Xg@31l0uTpUWXXM}-=JA#F4hIILBL<9YFT3B>G};-AUV|n9t)O-fK%`bO1M2U z8qp<%!0#4uuV2Y;=o%RBsd_8Sah?}^(}A{jHrLpo{hHcD`xyEM#^GQ}AB8~&z?PD5 zJ-oXYYYOH=XjO<4-wF~G%mtMX`U=3=@+Uck4&1I?M|fkGaf>?u_GN+N^R(6ta-Y}q zjH?--*&@X5W<5fj;XL|jGQb&x1C&6$hi_MPkPie-aQc1aFu-o53%K%~)c0iG{Eb0njMx9`E@zC^fe@iR$um16uA&rUqxU zf`Y-B9B%>C9IsMCgHlHJ+vb0$Fl0x@J4IKOPaeMKb+iTkmM z+8PD?p)Z9r`l45(+RC~ZVg-W6ZGl@Uueg25wkvwipe!r=^(FVDh(uiTSqE^_K##?9 zxeAEd(8LNUvjP)z$RMcjrv=s4o4-u2JH6ZP=*znKteG=mn4XGdM*B=C4bN9(?z4ho z)Y3T)Fom-+>W2I=2ieAop3lx_1ko8LpLHG$e_R?vu81v{0a6o0p_p$ym;d@Ngqi}s zI=7aJ7=JF@D<2OnD-iWQuLx-Aa#$NOm0|yf(P9G`&x=r7+0Wl+dbQ9N4Tc8!BVsDh z*891@8FV%f$nc^dTSx^4dW4>!E>d)l%RV6VG87GACEvsMOwO|J<7LQT`=?fuTK|5% zZs`JTSaXa@H~ zOb1)@U==qozf`~l`~yVAI01WWepl~#YtMw3&s2a2ju4r(TZ&l*{{c<*vx)-%_A+E)PORI20v@Qe?E`-Q?mdIq`EN)Eq4|<@4KxiXKjZc_%K%pAnrF`!jh`vg zDla==xV0t=TiZ*5S!}0909*@MmTVGcbCQNr_aGc@$v`axd%Du<8)>*nSyX zWtRw&GsK@&q?s=@G00ta(uOVtD3s}fmZ}zr7)oo|2@#rWw3)cEbCg)w!BA;*xARG~ zcs8cfFFb{Carw&jF6qcy73>_qISG2{IMg6dJuD?cWj)^B9!!f)Fn%{Iuss830z}!@ zLU$H1!VBHQ3Y8lC56!ic*YM^sr#8pqkFn?X5ph+>(K z$Oe#88E}16MWE=skeGh+lHCw8$XZH9bmbKM`;Xn7dg7=kC|e1P-L?wqM~$z*1w<_h z9g)@2vKu4dARlc8F8D0=%>&1`gb+d7q%LmzZrV)1sUqMF%cbMF9sZxOPICO@tmxWu z13+`!W~sk}!Uh=FLZO)SCS*Ob{%!E~P?J`cgM_>Jxl#tzbfD8z5@B{c3^P$bbe>PkhiD?|I2oZjX7jkVM#>;oPFv@kxMJy@G z@{|%T5IGnCH8=PG#+18|bW4!SM%m0o&c!PCA$`07?)t>&DsKPamkKAp>bH)#OO@wm zJ~F|#gdz+nLSfU)H>(Xc*+`YlV^Uv;lS^tTfurNJ)NP`3dVkG0JQN9?KO#7}#o}Tf zV_A^;!r5pEyWy5-zOy1W4DgxZ5Fdj=Hc#cxpWW}kwy+guYm_a%nA69YlkN! zJEbhF09Pcd7IBUKvsOQly!AFJ3$w*MTxNxizp(VHPnK)NWWGCs>&`{;xi66|Zg!1z zV3Cw5^OXV6Lc(lBD$GrQv-ii@dLkI_RSd@<=seH6H*W>Ps&u$w4e=TYu9Zemr=hBjud-!w*LbwJz5Bd zyc6oITzJe%GXLIkt*Yg@o5_h>q;m<&xQvr!nXi6896)tzIbk;c?83bZKBaJI%t20; z3(li0c9=yrU*ndR_l05ftE38IOn{VNOsb0X20&{5ijbFep_!lX^vTnoXSGE~L&C50 zk0JKpPK~f9KlXrZW8aISpOjHnLLLoSLUsWy6hr(Qshp2iko)XFKu9cX=UjsYr~p>G zlkeG#ssDK>1R@uNYYX>QF{cm(Y2I?eY9fU7#vxT@|HpEW2lg;Y1aWe(VSG~Q6&O^4 zQe_n_yLXCyYK$s|=fg?W{Q;8d9@CN+$pBD%5~67DqT{lKnKJL9^w@GXKGuoxGPzXek~pq2)p73e9i zrTkxk9T9LMzr_VN&9?(S!FoTk63yc)on9AYI1WZlEVmw>Bg0jaX{3TlG|lIkOl)Ag zXVnB)Z^_!VyCPsYwh^@ug(gs;r&`g?CQRa(OTYraf7H(MQ^}au&W6;dfp&nPcko~= zVw)DYq(gmz{saJUsVI@tX=6gTWyWxcPYC#`4*FIyw)oG&ME&>}Y#zpVm01b@`Y(kV zg6o6j`AT`85rwQ0)i5%+4KsxAIR-FzywUj_4wkKz02#KSu_EyrbYEaYhXOp-hFbooINBruH zXa$ko%UC(yeh(Ko*J;JTo)!$3;ys!`i;*N>f-1 zQ1H)Eh}t~}ikw9`wLF7g6W6&G)3==;2jVi4AAur-OVHvnB;b(edluWAaQ=d%u&m2O z%o%FoqqlKYw-M_;Yu5Buw!z%ToMZA;0Yv*MAuQfQ6MI}QphE0D?j%YSU1>g^SOQII zbw8JBTfj&zGwG`|R~SOZ`TN^>Hb`fQla1(vZsJvuqqqkkw+L_X_dKyGxB{IH2rZ-g zHvxeJtQg!jXmaxmdO_#B&n*qkm!-%?2$uuZSU#6EWMe>!pd;!n!3gYwUUN;=`e0U+?$T80VwmQPZ@9)$p5n4HxXer|1q_4l|8f%|VIX0ZC*?#8D0K*X$SslS}NsS+KZI@f_I zo0gMY5quDBGD5{Q0pZbO81ckik_fN$eLy`p6MoGbW&K?SfkV?Nq8x0(C2UJaR!tev zS)R&ujw+}S%gWMu-?})r_TYMZ9sJqYcO2n{4N_e=OtgZ><+z^z9NCb>&$ZzA)|j@f zqJsvTmVpV1@$y43Y>5$Pzm$?q)NWYSFdt6+YZX}^1v-2O6FM-7%6)HU(2nvytP zeuhs=n1ofGP_qINT?9UpJ95hi1%%$8V z?%AxdK?*3GDw}J^9HdCZm>+TflZv$avJy?efBjdDzbAZ)1ob6-=DD>{_QNmR?jaoz z`hWkof7OP|1^2h7O)_re=LhP)V?v(8C0E?Gg=Hl!VMM+sHBku2+ybxaR_~&@abYX- z*%UPHC>oX(J96DVeJ^F8x<%esXk{|dS% zTH|aKaLc9=$n9w`M1B7z4n>DOytPdL@+$mi8>$v`nsEgt3?;=7+&x3!GuxO5h?3(N z?kWC6Ezc8S6_<3k%I?80edwneP)W-!dB{SSnOF11O-$S+`o%jvc{FfJ(K~6TS25z>eiN~#vk6GLMNqR1-HR% z8*LgEQhFOpxDjGe+%i*GXna-zWLyfr{{Btl?YJEhaYY8f1ny-8bHeXqq9&yzR(?FW z9D+s&x;)K$FlQBAKS^H*b)n+pF0QG^OEk=PwGbGh?t%-h##ukR9Ioc z+5mgFQAWkkGLlaETOmTXWm5rk&-QyZ#Y?z!pKYx6Pc)B5(N_ z!7>p19*0)^EMWxySQ*OB4+gL9$4dZV(bKND*A;OQZqFGA@h3Y)ynhxG-W(@wf2LL# zhFXYIoI>5cKT}a{zlBONAtrUGrwzi022>~rk0o!|a5z6_z_^Ha^o)I;sG^$I=L}bXk zpwOFCw1M`6wam~+Dl9Vy^RNGv%(6^IznayW(tE6{(5O*?sc-vtG`Q${-U7;8^4w6! z^9l2rUvUN2J;NWvXb>#EjKDb?w*{}x_W*jORJ?6jgLc`4>7*{3iC?hR@r}P~nu@a>D!UFpJ7ZhB#7TeK>TL3MfH|{^8UHcFZlB@_(GPpn>U3ahpSz3#s8hS1H7gu_r7o)ql^Gu6*=Uhs{wX z7IJRGAtR8y29hThVusp=GY1TaMD^A$n{kD0nq0#D8>n4?V4%J#Z{=)O5YDABpSQ{g zY1F1dIDu-lVAVKn^rV*azxeOO{1G0I3~s%LaP2e@($*GiLP&utIlZe;7QH=XVQQXB z*7m;-M7AiM?2CLCM&5SJAFa~xe(!y-Kdx$)(4M30sq(XKWby4nU(HPn&CpU)7b{`s z@1p<)?<(QCr2T9*ATZXkVs)VhkHJW9ume3~@sM9JdvIn^Fl1m&sD_@}#SA%b*yc^5 z<1Yigy$k#`C&d6a@#mpIL6s7VcH$v3vx4HEGSt9YUAs37&yV>@0sDLk_5Kg;%1gVI zy?=I~3!=r)bzju37-(H*YOi0s^aE!Yes(NO8TYbM8?j^Wu9<9#iwSkK6x#3p5Ss$$ zDs6!~s9rW`3dU8`u10~%w@HXSq@rB9{sZ7|A}x{X&62hF8N-VucJp}@7ff5I_~x z%)+f)#d9}Zl9_RQy%vkt)ijl6sP@UmjVM(B;5r5VN-}7SsZ6sus>fs(d!P`U`X-5) ziFfr^_;|aFqky^-eH_#eDqjOHp-oZRK`2TWm7GTGt@xe>54lo9@by?7dPo0+br5Dk zmGG97aeBy+*;W4&u|O5q-tmN&H^|M0%YgB=BYp^#=BAwe2CTlwrGUsjX{c&F|E{hC z^US=}mHjwmuq2PuR;+bBcwZKRRXv%9M7(>WS!|aO2jQQvRr0g(9lmbVfPMN*n6iEy z9>f=3N3{=Eth^a;LH;So=F5nOIyVZ5z zO<5wwzq#Ks1KlzDt(}c`frtn}>K6>MOLF{Uw#Yh7&f7huKP)$niO>H-ii(G86%LGS zl>G}K0K;u?4ebaSMy)@CrJ;Z1Z{|+i6EsmE?+bnJH^DbURcpqCp zz~}BWsqN7mFV0pLnyPG#3YfF|$)wg24Fw_eTUBG;XSwa#^kP11#eIcd?mt%{w6xD< z&an124Kg(MafpQs(x|xON)2&7q6N)AoSPxJtSru@wr)`a^0!ZtF=d>g=a-Rda|~=6 z_<_z|DA1Uexz?I2$uTmkPOvKebe|qjSgdvlwJp=h% zO8fia^Hq9!0m#mOEGk2|ia+xsHsZpz7H^fox(zNa7fuWItS zZLS<1^QtdO82)395-af;>se?3ErREo&n@IFf!TcC#wv^q!64wM1_YTNSOBA2gy%ET z0;Yg^8wfP4-^D}}5Zn5)xfPC#awXuf8JkIoMz|mZE`@;@hS;+-{=MD{8IerflsSW4 z_N-hlU!rY&X%*%VOXuxBT;tDM#kK3=QlOvkr82J9ESz;W|08Yj3-0~mB?=<>-kQqm zOKQ8%;-*DlU)_UX6`VUvP$X5G?X5*dwd>vtZv-y+m=%ybbHylj z5+A-d^71Es%_chh@=Y?bGhaZ2#gZ9jIYUoWJYNnbNmUSVOTIw;vf3w4Pl?6mdvkJ-@Hjx9HI;q@8(`_Bw#TZ z*^G>*N4p6M9Hlb4R5-w*Jf7!ye^tQjVm;X92)}+5*7E0;l2>Eu;TKxVyxh-quy=v? z_`WZ<0|06y;k57_-*m2;2~I^xs!JN_A}dv3#``mvcdj68gt#RGSBur(8-=@tfZO~> zZ?YB9fK68WK2IjA)UCn~;YmB21Jzk4SxB{0l*aRu^p8kk>WCy_+ISBg+E z6nC`g&G~)T`{bVai5W17+8@nB8GC}7=(qYKVsS(3-%~dE{|RiHVkk2|cdops1;c&i z32_5W+uL=`_;MR4fSZ}59IsMRSG{+W6v*O{+$SrNqxr9iWK2sSHjlX;O(kKT16of2 zpdY*G1u;^u>nxX>7&_P6Z6dKweIDi#j|c;zUGCl1Z)9Qi_RA+Uv2vX*GQfASKbxkKAqF<0j>NdMJl{`c4!zcWO(*N7VXQP4PalF-BTpe~HZTRNi zDiUmEYCHEuQgyW(EEmq_FVoS~P+EwLYUAcPCc@AfP#KzVZs=|o2EvQNTGG|>;@P{w zqP|4F8gA%ADS*UHZR*(=TD4?hqb9dkP+HrMjA%DmS)z}WaA+%VP=hEvUHoih9aLQn z5U`UzhPxTV2-5?MS7*5$xKw#@E=zUh;PcfQ%)1ot38+^#WU4^1i-0Ck?Bk7s_-eO+ ze?aB&#t;dU?|?bZPv`q}0MLEp{gSBs`8*gkH`ed!CO_I|t5eIc)#xITy9SA^6b{P-Q=c^msT}Yk;Ui3)Z!SoZ z0fuxI2U=Q_LcBo^Ygnxo@!4ORPM~2TEpYP%_lzfb7e5vEl*uXx8i~zNgR8j+0n`dE z)9{oJ3sBojM$v+@x7oF)P(fNB+X_ivWDqH+& zt6mLWHH7@2RG!bg@Qa15pY3dNV*p_OpT!qDYkUcIRqS*Jxi4tDT5PftIT9a}%@cQT zS8YLH4e70v&b=U>HEb`j0A|NIiIUO7r}zudO0!LxBY~%UnzGnYdEQj#92`QxZcvMb zNkc{%Z8rEvbntadIF-UhE^Ff6QiI;Y$04n4C@$!d@A{pu*Ac*!$4{;9j~;9Y{B}ZFu4Pg#{HRp97LIpQy0Gp-L=F|!IuH^C-uwiIK}B9Tvlju2_nyCn|rR$)t(H`)Ge0zvhzzc zesW8(z*WH1=03re@Ut-FU{b$lsG)*>)%!Ej<4jE%6@u9F*85q;|N5_sOB7wen@=Y*#BBxfQm z%WlVzW%1vh=9J#j#EN+~hP`oERb_yY3Jx?*kjkKs^tFdAtP}@@4Uzx}g0s)8U@%3- zep~+_>3j*lsca=Htz^r+uPRWX7@}|u<^NC9AFEpQHNk^WeP5t38dT&UC{&_o)`*GG zbY{A8gV}E|=MCn*!T)52Zr@-%LMH};fgLD=fmo=)2qJ}{q8OatL^rr_?>T#~)m>dx z-TiA4e{#>azRgsntfR3YRf*oWS_F0Bpjdy6e1ZvJtF791g(Bbu3k%W{JG`+4r%YGV zPCE}p8S8myt@i=OKrD4$a=D}~TAo5``LKj~@ffepL}276U0XJBC^txGmcj~{NMIzY z5OQ%*Wi6oD4vL0lN8`I8xQR0sh%!-;7zX{vZ*8U(y2vvIyI4`Qz6b|G z&@~%zF?x*tpq^%3L2G^hEA`YCmIcSSTxzkE*69x%`In>Z&X*T+E)Swx*-fFN-b$k| zjAP~W--@jNq>8L_H#=WFv8Xb+-&B7!3qJe;wN&AKd`Ki$fYyY6@X~ce9$`wntg^p5 zwaV-}%GD8;2N@&Ed>15{1R=vj@RVUD7Y5XzT>uGanL6Z2RPhD1rj8mgJv80>3mm*p z89W}O87D91OVCCl(zUTM_0D(FOf1!27+} zRGB_QUt|CJ1=OaUz$xA;`}4|WQbT1Su5^zbR5e7~r1hm(#=mp;4_%=vR@M1ai*op8~&onXJ~R1_|0-fG&tH9p}gVa``bn}+Ip zxG<$FG=+9y42guzwBU;NM&gWiNdPpzLR`#8>-B3wfYc>WZc5EeyU zQ-G1Go8I2GYkzG73TuC-U2x8VTDd~&eXgiP_XM@;vj;Dq!7WlNgsWNxx@QC$ER3Oj zG+W44rS_jX-PBbo z13WRT`bJWOAaynJaq3nDSYh4guLE}?xNZjq$@lA|$ImeGG@O&)p-qCVS|sfExPVuZ zb`LL6cF_dmJ^$Fswzw*9V+bV85X^l3&d<1mvy63nivh?0u4RN#uSN6?rt0^dsjNZz z-^MHnTQouMU9_PsW<=}%NG~R?MiHcm^JaCDa$71q-D%dcBvU-M^s+srEOrDw_pf+F zF#HMUqNf0})rD`cHKQfvXkvyPvUb#4WI;C>?Vhs^6wGpv4^`uHmjzsNXF?ajjWSD* zadR(WG$q<(YDwQ+V7<6)v)uz=Z=SL?j92jppWECs7p`(?G@$P`^F3te04t&sTH?cv zt2BRvRlzUUntl;B;eR@O*$k}`695kPm`Ik6#>pQkwm$U}=*lYci$?%e<)O^C7Glm0 zs6w_Plm!3d!T>cIqese`DQ716uy?x#c?6_M=)oJ1A-h&8P0oWp85YdL*a>hX?;*KY zvxp{Z5|`zK4ENs648Hk90R;N`g0fYGA2*9E-l!B^6NEVxL}=foiFK)CPR9~-tP%_V zw=l|ViKS9Ln!M%~^7@8QeLGrln)Tg!u!}>MQ?mFHx`-PIgRM(gakg5sNI2(Fw)}>9 z3o22%EHi2Hoa}+Xpp=6{FBRRs3psGMXAgbKNbSaveuDEui9GPCJlQ8xh|_`g+7dzy zp|Y8zOaQ9N5Z5a+%7cCK62N{+fd65WWp*je*${Kz;2grCq%|q&`8T)7gjO1EcIQ{* zr627u|K(knm0yz!Me!BnE4@J9^}K-=7C};+d=33AGFxS`GxNOU^&&)0k`E2P$*Uz`Ym#YfUJ3J8~EdT+<3MrFAgydp%9??MMt_?2{ z1zln#1=K!m6dWLK{=*#M17aDRr;J~URO%Fe4PUnwj6- z)}~MX&qNI9J%3+{K;rIw?QAOz_M3VE;ST9mgxgtf)w;N2QdNf?LmPkpar1j@%>e(g zMPSj}zZc&Lw^t_&A!9gNh-S>|n?X(&H`(3t`qjO!tU-vUfnxl}yEWWN{W#n0Xi$aV zJBtXVVz_C1+DommFYKxq@O!lzC4}n3AEPzbud5WV2dQs)Q9@DbX{WOG*VZQU-y<~G z%G?pdY=QHWK!e@_K2}O57f*Ka6rfk;cRTx;Y*}Vf%yJ1-v#Tlue{z6JSRHP+#l$*P zU|{)9nE0)b8UC#!p=yseL{LKxWLKQsNQKRBkfQc5Cs`ETVaT0Q9V`C~FBo=2P=y^k=aNHnb15tfZ!z1!$v6$bq}%<9;DU&YOQx6g zr*fXZN-w}|Hz$=a_&G6gTW@ii3=C>2fTLPX=Z9V+i@ZL!6=EZXOPFU;)2^HPQNG-R zh0m>!S+`(f1?^7~s6j7lr?Oq41y0R=hfeKdU=`9W1s!)=-R@bby|b;61{7!`u@Go4#i)QxL8#3 za}{{bo?GLpZf)UoK*DdO9b&%feRFd`pIYt7&Fa58*v-u8YJhk`IE@;6A^PRggIM$ zCC+E~(=LWTFVRrpbZT9|(thIJl#w1S%gMOM zR+Uv!asU}QvybhArF(11MHxixk}%~avy2%k_XOo06xWbb(FmizmzNymc>_0a2A|1D1@HJa+$CYI*Kl<{yxjCP`=NmYPC^l1?PSID(8JLI2mf6 zn|+K>tMmOyqI8D=uMcJT_=o-^;};n~07z=d%QNjEep07>7Q7#G%B$j8S=CwwDl)J=y$L_p=!Ku$vn~xG^h@Oy zP=76i;F4{Azq`taf@?ba%Sk5%7A}8($5@|_kf|vw}|vhnYt8Q%p5LLd>fIu1zY5F-Bp=yKd)#SK+r*~IlRrw zJ^3|gnMt{}EFjMftp7ULDgTTA%@nuft$N96Kv;&^Gqrf)?YQP-iGCq$qrTOblnY3b zK>vgPE%q^W6+~F-m+2633YMeTpWA>*xNK2Q-OsIcYv;+7awX`iT9f1){_8)dX;wx+ zltP@}*Wo?)VA%#;(YgX+2;sJSO0&i__b+fDts6#dYL#Kd;yex(eIEj^C@lOLq*B&B!!U$(&-R9GaAnk%3(J5Z za-PHAqE0SBnXpFC@@yjBvH*lebD-t9+_SxgbSrJT5V)KGp<*|Fw^LMPkW0cL0~r9b zO>X(W9xiPBjPub}+zCw3&s45t+WFx3O4i;#xD@1^VH9Ce(6F=X*1yfT-8*qV3 zyImt9glkX?;NPvR(LNYMK=~hbjJky+MJ)4!7QLSKnX`(+TK~O0N+|oeNa`0Wjj9#0 z>>4fhKpP5g5lfK$MFIN^47pa+?%5C(BPU!Yy<7VX&B9dNfaQ_QoR?>_kJH1c8v4Aw z&)=aIetVB$v5o!97bZ0Ms<@ z!G@4*?dQ61drAo^0V0>a+Rt*juIgoRwE$_rqDE9;=HD6!Tr+7QAkay$QR%~JhDrOq zDctfR$>*ulO5nU!!2hxdOfLigQ-(_F~Nm|{X>JNY?HM`o3u_cePhr&KG+(u zj?B==md!PgZH$b*D7%%Yp$U z`9_FO_ydDaC90GLeR2Yd7h6H*rL*rcWf9iN@mAQR>He7;dAbw;O2L&yT$3kNy_7*z z6rea509Crlln^MVi&%(bfu_rS)>9i;296Ex{w}YKe~i&G+tBM6X0ion>lnh-<-x2| zZ7IH0y>Ijs<vd5_$^dspjU&?sQvzc&ImySQ^A zfh8a!T$IpghNt(iaH-Ap?$oga(rkQ8P7H(D+Wq08fFB~Uqf;=D`y<}i$Q8x2^p@WJ zER&BFmX@;fLoFeECXra20s6gp2r;coJwPslxg0XPDXkWac)C0ckwg%LzTkb9fJX@{ zfAXH&DQofZ)5L<>VZ|Gks z*gwcuCmzOAt#NHYzMoZGt+rn=v`=Fy;9^mdM#?0*-za`t&Um-kNp-PZ)dZf1#VnpXw>y#h$GPgmPHc(!ODB_r&> z0PIAh#!nOgx!{0WwJu*B3=1V}<5pa>p12+Xr?f`Fon#+yonD7?f(-IAdtSy`_I`Vw z3#}+SXkzdC<5@%2!r!jskS{aZFd+r1);ud$XC0VS32WmD_$#<7Ttqej*W4)6)%swH zI}|*jB0kI~xf!i{DnppB-fG2c)G~Q1ZK2`V252~}aBJAV5)V?1YZ5+x#ds-1BnbUQ zr50G0R3R7BY}(VkrY8whD?YsobMG*`;4%;TWsfo(zgSAXNQ5# z_sq50zx)Wg*rwmcZ(hN%dY_)yfYGVL9L1d0K0x`Y6~=)0c4%(5JRSZxj>)da^yw8| zY)F?2x#yLLdCWpV3myM)EY&bpr6m9c_o{4%G~ZtT~_v%AAz zIv7}`C!2z}a~JI8!%Wp0rcgvm=l}Gk&mFCa_W&awY!%ABNakso?7YSy3U_?t-7kDD zxs}2M$laT%iaK=xYQe0gP=x_{dcQ6Hft^Ym4I5O=OA3vl`LdV z$gMdE9mGfPF<`3!{c@!Cb1?;^Mlqg=RptfD`8-=#&{M9(_N#D=)cUD+h`tg$%Oz75%A(NH>!P#g z3MNyxnM7gIuYUUlw-?;!el9efd)h-*V8UhejKs`miGnXW*8YvGtgu8cOHYR;*t$&Gy_k zQ6iz%_`XU&)ELS_*|gkDP9Xy&4L9`iu=)g5s&uF`yOLT$pNj()Qbhm^LLv_mLc9dZ zAJ6*=1W8yKzTNFxn0^4*y}G(fW>R@W@LLN9kRZ>dT(C>ZMvU_J@~=Ztj6_YRf^$W zlQ!PLO)hZH&$9obKL?7RqT++aCZ&`XPwvs}5}?6+?GWel0g*FO=FK^M^UIQp7i63s z)cN38;33NZD-Iv#A^TXiT5yO`Sn+ZzS-x~vh2`GV?)j2sJOQl5H5o4Q5gukhK^f9} zCG?f2ga4uEW3Yr%O!uwTl3rj+C_3p<^fKvKUbE%-WN5blI@2~lfJl%jOqM;;Et=HZe!PuKCluMhDU4A zyLOflNo%tpgow&4@bek(jed5TA~!`&VJA$n_zNxDM&IW>>^P133N~##xAy_r0qBk( z%CRv$BCKpmq{<^{2P}wU-`7m7=vReQ4!|op<5~qwPpTb<_rV;HED0xTyxE0K5)<;N zW||gW9H&%a`V8=+#v4Sxw$_{ z!Iz{49eo$P7^A9vlbXGtiVCExv@yNQO+`OD9l1hymN6?VGzWl^rF^k?>?RT6WAHNn zkpefzTgGxHcSW}eS!!X4olt(HRcDyOe6rqR`G2(KG^fHMrwz~4Tw*%t0T^Hj|LZ@{ z?)kMm8mjxe;@`)|+t|pc5m&A`aZq``j=^PH3hWBOSI40{DhEvM&`8q*r7i@x5G9z$ zMsb~IM5io2V{s%S;A)TiN&`enEfN_A!-V{i#r4G}dw%?7_Qd40$pln#`yVG_Fl-J8 z_{=}pshp(J@%?U)3VuY;Gf4``*8BHlg52w^Pdv2jT+`Sp+rq&Ie>7BKmp_zMu$4iZn{D8}O@8-du5*BWNT@&!02uhL?Qe12V~MB-x`m@;a#w}@ zw2|4#1lYxC*NvSsSOW)S`BPhdp(~?lRIiSPLbC})wQea{{61C4JO};UfC$OLhHANagszKrY>wi^$pSx8n@~s-{HA&Y_!k=R^iBqzrw?iPS?1^9iYI9dV!ZWGJaEct& z_O2zmtW{(hqTmkBGD7ipz0(tAe69-|hqYGlQw zkhAFa`f~-;=3Yx|;<%JL+1pH*t zi+YRDWpNmp2t)Uee4ok>h~XoPr5)s9?|jFaoiRQJU(7iTiAurjSu6 zGeK>D3USq#a`F4Q>Q@-aisHT%#RA_GZj(1q`pU;zjV#4yNDB=4o)sd;m6SLrn?@>N zz&1xN#~mUa4uWGRN?#H880U+_DWeiw6oV1xF_Ex-R_LGQ7Lv~G zMWDhWC~}0|Kki6C;7Y_STElSq9)vwVwhaDVKNyAE0br)i%{-~8tM4DTVx~9f0K>4b zw4C$v2y0t^4}!HnGDbL8|0ZgyomyBis;i0%BFL!|164dVe!Lr`Ug^Tn&FtLf!Y#1z)*;?NZho z=CEvjcPa)`m#VPd4qTEc-uin_h-xoNjLxs%e!Q0-=kTcq6v=h976W&oiP{T1NtK46 z3umY!&AFrrX%iC&5HtGNl_D-ijUy~whr2AbxpN00P;gWVyk}VWZf^h5+`M3CVQQhT zHhhMTi3R8+Q#cnRriwpj{}0|kcaR;_fNZVT3@BOio!CFf7-GDSfBJy9|GY5j_|owA zY|#w+Y*o?N6AsQD*UxQZREs`1s|6*=C{fH}?xKbBCKPz%v3&y}jLrwR5gT zb=ko=d69q5O*S>@?`>_y3oGGPYG-R>*c)m4XXI$llXBt~lNG11f7PX*uMg>~Qh@6` zqUjMfT3nWt(tf2mKU^vQ8tj`>d|7Y;zF-odsZ_&pae!X(h;5y0|5fh=6U%#D2U)O; zmP2c;&VH*lflosTwo%wrpwj!joo67}CK8ZdkQ0TyG>}$+us4{v%I3{V=moTA@GCG_ zhk6!85!T+GCmkw#MgVq8a!O~Xibmo*G#`v2XDlPg-QUmsZmqCaAiT?NY_i<_pNUxc zBN53M`0C{??u-FLRkqh4{^Xk4 z!~oe;ndkT1LW`ctoqW2n{g?0^zf}_&*4<#8 z<2l$W1spX_ijCBb6TwgYhWHUK>v$l>#vD4uKzl|}p!qCovDzicZ>9_+7SrDDeST;D z+nW*Fc8kSiu^Qka&0P>)VUB#>4vQ=)IR*3XNj^_)Qj_X3>Ag|T#Jx0&+{ltfa$BP_wQItkhrQXT=6;6{ip|0dfZ6PnR8~gQP~ho#ql-+um|?GTH){{nBSD za8J+S0{!jj&=Fl1XbezTD#s43X)f7c*2_T zpwq@rr2%~mC*>4f22EOVVe&uyZ*w|lVz5@UQrFay)Tnrw+OTfv`(z)>2?WduqubM) z&+E^Zq^obva7qqAyAfy;LJI-=41NfeTp2NRn~;OPq?I<8{Vk;D~O3is6dR+mspKyIrcEuIEI?HLFTd~4>3_zs0jS*Kt=v2zq ze1@YZSpyCDXW$JvKy9yk+eBbc;M9B;G^gJmfdkML&^@~1?-cTRIVw|D{XP3*QeCKR z1KFC=+h!WSPvok-jo|l?<+hX~qg0HJwF!YZPgb zb3Zp_!UVUcU3$0Vv%gjWZ}UB|0Q7PxTVcKDitQyLxb5@#4md0`p;p=YHv#|99in%u z^0l{egMCuO>;wphMK-c|j9`gm+S=n(&E;#jS62j|rE!2PWHxpklK_YZ2BDCraEqdC zf*+Px9}>1d+7B)Z_~HL%U zJGrg#&pct4>ISx?$fP0}Yj5_=*ooNsy$|j!nbVHJxRL7q$oa2a5ORo1PX<7*lS?P- zEY}@|?@w7~m&%F2RgS;m2|)B6?o{UZ zbKneiF!b^;JQUmqU37*j>B&KHVD2e*EwpDpOSaPA(OIlyA*!5PabQ2ss?_z!E%|Zf z$|#m<+j$)^Oe8bQAGx$!<``iyJqo1_M@6j@U&7?3Wc~^eC!d^^+p>!q_t+8c#K=79&ouxN^ zu>n2X_mnB>mIsLzu}9Dorb*teDp5NDi9CO})?fRKY|v0yeov!dn-7-+51-+hx1m(> z)g#v0Vv(sR_?co)rL%kbl)Zbn*0LFEihM;md;~1;k%e&UA_9tCgR|dRv zHbX&GlQf`balsUQIFBL229}|K6<1KNQ5DI1yz4O0Dy!=Yoz7#)^b+PF$%?Ko%%-ZB z+|6=Xac$-wE(Zvt9^9+IU##s_%r|v&1e$GBltAw^q0tM~fs6p1NyDUXHENzm)St(w zdQmFHG!TJN+3R0dGPlTpH32KjZR%FQwU6mwMWx>Pa?y{xY_a%x3N#^~ziF^h{j(rJ z9{@%=7H*jon-?0`Ylo)4v| z{X02CIc0ZHY3i6>)m8-@VY&BRZStxsaPBZf9mf9Ge+Emz?;dXvk>#?L1kb+Jh*L|U zzVh>m=W^I}ExRGR<|IgpyrK-Jmk|%fM0>mzkuw&yF`tCkpsZV!xU-e|vNX?F*^;KB&2Rib0n1e_ z6V(sYR%nDtfByyqmZ}M96tOu!`i1QP%d@FOqp3DlsH^#ySC$ZH+i~?J>2M1!{k4MV zK36}}D`1l{0XS{u^ZC5Td~pz`>rGAN$?f=Or%idqdAzJiaVh#%Wo4l}f193ASKyXn z*0*w2959HO$mh$_^qf^<2H~%HzVx0DE-y|NDYMq-+($#7Kgn(T@9o4aXi{&RkEcxM zWTP79yrk=jG+oHq(8LlDfZDochV@f!-iovl@g4)U#8&z~UrI6PN(2zua@e;MVX60Q zKFj{H@dD2>%usVya>`2~S|3i`4pAW->aUs5s>@E0eRzA}#)7C7Z!>FYLouakC-Y?G zA7w9WO%rzC0SqnKJ5HspdZ~?!gi2+}xPUOMXtWG1!6?5^ zjQ;5moC3_=@+Ad}fzODU_MW)H5&f_K5Ytee=~%Z?t7v)%E_*8XER|`l$mKMw`TR;m z*1)ZJEfu{~T>xbyDdc?19(Y;iwzcRb>Jm!8_-r&qD;03L3W@E1{qLCG)IGPsWw>O8d#(_q;^z%W<42GX zHMIf?+ppLY-p|jh#lVqkiu!vt(AoVRT*2(`8A0zseOU~2QCNZsRvMPUC7w0!XEX&Z z0WuEJF9>Q$0B--l`&LE@724h3kmm+v*6vB>)J^X%UYIh|L#Hu+f2iMI0rzj*8eJM^ z#1w}hEn_=yJ5gK-pD|b)PzWlwkjRRVtguQUIRgX-al!WIqA-f13IG{7J7>-^^`gW5 zMXtvh7jM5M@IBiXpKP`j!tGd!`)bZM7=8Qwn~G~{o-1mZt;hxLKeJ_k?B~j?mpNQ6 zppY~Pkl)AFV*k9Vjrgh(D9YBdvo&oE`~>q^uJHY2k9D6Rf8W?Bu>MjJ;5M+2Q-d(# zvaqi7kPDt45=y_HuX-8ujuG|E`MWfjs8Bj7_Ml-pZVda4tPH0 zs#on;0e-xYA73G9fPj8g4hfhAXHIDR8qAaIF&PA9xdtt+Q_B#E_t6T>{)|iR?rj-x%R=r3TR}X_du&)mFA0vMKb{E%DrO3eKP1>!Br!ZfkA35 z#_1n*$#T$B3-ihT3~0*}05+b@LRyixEuRT$+ypptsC~Xd$RE^55I~l)jZF;-%-f8> zX=8h;hJ<$8awRZnGMr;stI(hC@0b(pF_|(!+;*)1E?nVO-I~W!Vc=dx@h{GhF~rqs z8WwRYN?4;|RWz2Ybt*Ni-+xC&fE$U=OV!2Ekb1Rp%K#FEODY6!YCFHzN%T*Q903GJ zLJC7LXHwq!$y^3t6_(31=Ujy~-0#;<0yq?+sTBhmaLqG;s0Bkhz4yco8XkOd&Xe2J@IF{~ zVCH!#r#TdoocZ^2>Gl6OCHx-QJ*l8T?$SW-eyWZ{kz;%xVQrDHSgGOuD4GDr)wMec zhVr0@w^JC*A_BC+Y%d*p>f-xbtaVvrM*ajHI6&%afdIpJ&M*Ef&7EJmiZT3%bqFMK zj0I!Z=4%{qF32xxqGB{DimW8v_EF(@wnS0~0ihS0i$L6m#nXEuT`#Q$WtTp@vCZ-7AW^@dh~ zih|BYJYDENX04&YkGr|3E~f&^G@572(zJE2g5vqjOC(s4ei}VrMreqLsdEpHkq5Ie zKkY4+2y{}=H718eU5TfwE6w*-W*+rEh;%)LwV*GMVL?Nv!kWCMzGSgg?cMX=cp3AEHF|HIk?;^a1*J#+)Ywbj!^vgV(~S18#p}M=;jnYI$>Q5u3i#l z;MFM~+$G4*{`gz!0IaTCmA3|upJp~GTeZHZ9|=`5Amt@lCnxbUZr^K^2eQ%R<$=p< z#hd!`zi+vyG}fF;p{CYgSD_LR!r2^jBhyGEG3~>uXEr-1+dE^Q-4DC3=V;)_~X)FB50B&Quk>H zd_GXOb0D0pPiSjjId@=tdV_@g-Uhf`6_^j95xb1^46|;Rkra}V3^oKX%MnQ$tpT_1 zcgkyN1qjUZCZy~J8EEV9RhP9BQShEs9;;~-?jCuVAYc+C8_M9bMgGMWZYYvN^ZN;S z@J9q&v_1f~`GDjS+Z**wtmh4Rsjm?q2Y*AFpiz? z*A(kF%0X19bOw`!tXAcF9dK^6_@?d z9J<#aUkR)1My}|6oWMgZoR5T(vh^$B-0Uz-Z6V0KwaiOcHjL;riY?YHA7-rUJ~-Kc z-bZ$>ZI1amwN2Och5@SgrmODZcQ7OauL4yz7ks9OGz(?L_0Tl-ZUg9CFx&dqe+J}S z`WPdzJ}r>iL0vv}DtjEr+EVV1BPY;rnppihGm!Vd=@9EPO$YM1?U6WOn+Airoh8I= zTTxc2?SaA(G{tbLjF>3QW4L!2!DaE5mizH-nt`ZSKV`k$KMDa1VKC3943W1Y(U)gQ zY{+fs{VZJiKK1V3{G|Sk&JOy1-se8A`=2Uw+{A5K@39!uE=x?EZ;1RzrKxEXsn4H< zaPYSQx7SMj@C1?&#RZpTng027`&$j)?{WlS6(|w1&&Y7QZ;xbdbAe$G1kk}_;5ML| zxB|?X!k+D6_85(DQtyGBpt*$4IE5udR&_0Z@pxjPtEg*%hKz zoBVohQeiDwyJ8Kt4O(y^&lpBGXHA_%k$mE>dzSGFuLL<~?%{ZkA1U;n8_1QEsk zwi&tCpGn^`6N)gm2y7+WNKCXB9j@77Qp0~V8%)uRZV|O^gh`mTSr-I-zSXy zjl30C@fqZKPXa_VS9HXrGKD?(J|^a~G#-Dy1O79Jt~d$B{+>mf zefRS|SHv*Vx)d%I@s+jhP`^g5jf$%{sataCJ%J^UwrCL!5dsWSOW@$`1f%vwJwPDd zcHLvb?H?HW=MI1xlm(wFeLr@%#mL)W7{+L0iB$usD{6!I*?Q0Kjca=8#}2u*ooIpn zs2S`Trs$l7WeCiYTL3d1MFwr0a9GZ7V2@s*R}9V<0{r}y5;u+hmgzLm{f}T z&3T*Jx~SNV{oOpkas_}pTl8#5zH!Ou0rqOE$S_}%^*EN3Xq`%g5OR03;|dmmtbI)v;E)sk9?ndmNaXT z(ohZ{y=@l$J`P>l&ZJzH2I+y3KaB1(mWJ#FYSQZxu#vX5FY2=OLY8^H-*2ThWmt?A zu!bR-E!v2dot40>uyp^USdpTETD_G#>qcR{lvDHhK0^poEC0Ln_XDL?Z;3f*>)a&t zfL**Rd?rrl5*GT@!L3f;=TivBY|!h$xBxBoAiCB?{H};mYgb+Ft(~B{QdLr%V#9PP z=(F^VqY?4c4&=Ta8cQ9|rRyfzNq%hUY0K~hTeM!A5der0tKPewe|9GlG zR5h96+z3tArS~&oe!C~SApkLcN^S8YSYa`i>YPQM5M;!Nkm2P~3R_{Yr^UjvEizj_ zF}wr2kRr;cZ78ZJy?1{YMO|V}ldc~~CseCY-h_6(LOzxWm`lJsh0KduB%B>|jUVPQ z(|~jEAi^rlT4j>N->H=C^7PINxN!zwmnjFpI}D3_Snh}c9v<-DgG(x9y;F57FO5v3?L&I^?HF(GnXDG7 zcq?w1?!VW_{Zpc&+D4-dHL#UwR!!ksiRBA?Xcvh`ixJ@kffIj%|Nd%wV+QHyBaqt|ZXwv-*dGa2g?JQkEw5ii9t2FPMz5GmkJykzF2|OX>sw zDx7`cjkup(1C<9^1~&|QAcqH!SX_2AJ}l7T3-4p%kkuy^VTY{5retp_KdSXaK_H^bAg0V-GfSTlE#xt^A(a7PSA@UD0t(CD@M5cSV!rpt*i#IvP>nIc@5 z(b+Zb7ia}_Y98+50%Vvc*K$4xU<1IJ5VGXN--kJ&d|p0sh8&B_o72qqIkO)9Mj2;n zqi;IQ!!-zN^YLm!jLsZj-r6U^vqNy3)ZP?|Q?bvCutmU@s%Zamn6ljftLj4Zx|SMf z%@nqH2w#`@WbUcMtUG{HVudq#F!ugX;G*L~7sWe!`eFdN31$y*3l?VGvEeGvs8U z=Wpwf^!n8qWBa5gtn=Hz=t*<2w@jdv+-&W?{(~(ANE=uRbl)>BB(?j&_SP;_0QN`9 zleglsxEyg+5eAkx$}fZ7&)%OF^8>&W!Uy^NI+g_hJyYe{P#p4HU7`JblAMiJm=ld3 z*u6RP8;;j6aoiWZ>E#r(f`2hJ>>p*f9oQ;O6r8`NM%ry|&6Yw2CWw7jw<7%VSy@-m zR>Jroag>s7-4x~oFmkYrrnZnA|BO_{+}acVnoK1N8}(liy(afe0qcquy>QXBs3{I1 zZ$bDih=s|(wV?lQ1$qf>{XGcyynufF!*c)(q+8^DeqbB*KGXb`vhAPk4N+1{pR2gY zNpTf%1SXoCVU63z3B#1jeSd*B{LS`tjf6#IpX{%~SsD3i_S^AUCRwOu7TaB9<~;sEJJF@(j{3o{N3v z`3ga2(ebvaTUcq%d$9H6Z6j>SG__${!x~FVWuDvjGqt1(CS2HUz@5yKIg=+#aPpY~ zGQDBjLi18@n@sy{@QT=nn~L50um21ROye^If@ys}SU;oQ6_PfxLd^;27&RxRPd4l| zg%$3@RRT>f!TaIRh1FJFG1@e5jr!I-^CZD@X=;FaJD8JN$VwqF_F#ojQ*(j^Qq#Zs zJJCM0eZHNJ|FIR$fdyMtB5KaY| z0%>T=1s7~8Q?Qt%mcQu<{;kM{!fjm2N@Su;SLDsfmbDZUakrD73ChNCdC0)(TMi2= zfSU%EzX4C-WIGZ0qZ_`ZXe0KJS{ z3(+{3oNp-UX%FE`ZW`UhyVk{7@7{W%ic;z zI$RWR|LI|>T-Er=Kt2}=iP=vX<|u_dL%NN)v8j|}_C^T>IU{Z9TV_p%BcKfs8&I?` zIj-roEEy@hCHUX4JC&|1oYc`A8_$*QN?Sb}Q?-7tO?08hs9K;3D8MoPoPuli3`I7E zY_KWLV=biooo^U*2q8ORfjC zs>&t4WoGvR8{GZBO=?9+l#sDA6GQu?4wMB+iVCHVNG7&wE zxp{A79eJOhmP58Ka@CX`l+&s1${Uh%kg&mo_-k?(*+G`iAS@hG$0D;gae{Ye6`2qP z#32$AMcR;VZrJm?8^J< zy*y4GkZ7d@KD!uNQ5I6xi@+%gGl-9+z4kmztpxU(o@4j4wI zG*xx}P+d%S_&qbEPzIC2Yks#b@BsJyNiyQYUA|5mu2bZg$}^&-$v^+7#M-h_i8jg7 z=aLVur3g^~HYZoMeV-fA{I5G;-vR3mm>4X>furw$z5^;b0)n9kgdk1= zVtwD^ws^AFv)7n&jIK{r_pn{aaLAu@DroylH&X9UPR+nlWWKsH*dIePc1Cn=)Vq6p zJ*TMu1E+FgoXXsi#q&$Ga%}Yh%Ae|*UM|8LDP&7VY6=Hfr|1}5v709R)W`XAq<(ND8R0#g6p8F)6 zU2dTOnTXF0m%HmNF;F{!x~(4&lhI{vQq?JN;H-<-p@T9a?Ttt zA^{p1sJJ)itNu1|&oWwB9QnD&Am9EG@A}!ph(S&NN=L*fE|hN-A@YPmVEtgYPYB&8WepwF!>Cj=@q9aUh|M1U47_}AGFg0`MnpxoXzG;SI+ zu}5E(1tW6fR-8P2<<|6(nE% zF5*(Sf`Nyg6)d|8^%fbU@z=O16Bu2)i?gH%*VBTn2ErCAkj{@6d_2BFFY z1A{??H(Vszvwxmh?Z?=N=@x}wtf|w1NXfN8Lx_@g&weJrPTZCRBJUBNWmd%S6Ran7 zVYdQ;3szM{KhDPYAgudp1!;Ag7fU$^WLqqhZ}uP~&o^?cdF+eK&8r_9heeS*zew%} z9xTlSbO$jDHg|kX#dNzum8gkv^YFzzu~bu$S{tN)F$6#CmD0{y5Oy&xvWVj)4Q{Cm zNe4J}GFn@U{a3sP_oz)>=BS{l&K1)

;eRB7JPR^*_lcM5bVNa1D1Iei#(PuMurhu)Ub`>yE5?n^9!}izMKox zy9<-cchj$p%LqY(Xvvi0#vm6lFuXX^6)d+Ga)*V+dSX_zoN0(8MXHAdQeaRSdHpj0 zI5+qjNkG>JhjKVtpa>v8I+B@d2I9%*K(N zG%L%8+G3pr1x@5hk}?U!PS8Kx5%JBhbd^PWF-4DiZ=7O*h``OOhQo3wTz4;So^K(> z%l$cjHFgg)yS}XbdA^YZg!=0;gXLR8^`aLkxAalfM6EO56Qz7DHeQR!%Qj)-i>#&o8Pliwz!SIR8Vych;IyE z71xOw^TxzKsLh3oa8a{Qv*uustFYAacO?|frXwDdk}|#ks9;f)jsvDxRFG2BegjZz zPg$KLyRar4yP5CH)R%i@NQEjeadS8!poAB@u{MEc`^8~mh4;uFP{lpwr};m_7`?q~ zL1E>{MRIF8giW%qs>ZuZ-5q;isyP!pqRs(zHtYsDw@~ySv&iCOdksLw%6aL?$pBM; zeNephND6mYwe=mHigya{UZet_E*E{*by>TpP2k2#h#uHI^;Q1uKWh6euwVVWrHC9N z+|~-1HZ7oU7&+})$=1rhUDMgm$V}x{sJCSp|2%$go6j0$P0CDVn4(cI30u}2ZA23z zf$?^z+Xl7hd#udS_ZjZ9x-3lJl0!&upo)!>rMKL_EG}qTF2Q+*w>ZxtOvMKy|I{YA z@-rL)P+eIH7k!x?)^8#=5m2t+7c2fJ)B4MTGk(0CxGijD>&l|im;JJDbIXwtl=~9$ zJ~t;^&2uYJpu{~7%n9cckYl1WuoNwP1kC{oG^xP!0(D1kgOU%q<{5sAYZq zwYD#}`h~W)$jc}sS3!R5cC|`p-E)T2{Ci4)oT)age+LcAbGhYv=sd6f>;EruWhIsS zH#G`E8J5x0iIyxlNN@S~Txr3W=)cdc{FbyK&%>(~7ymNKpO&t1j%^65?a;=fE>9pUs*NKe zYkI1RdSA$1)B#R+9pC)a{^x!aR^^i)9iM~Ed>7Rd1xHKg7u3&xk2Is83Q);slaiCA zt0rC+q8(sn7e+_(hPy)4eC(3LDS{*dE+GuG&jgj{@zVfdj*U@34gS&ibdi_lT9jp%(B5EzS%ReTclmBK(MS|O3}`wN0Gl=!vzY#0 zUMz1@mYLJ=be0+zKWA=l4h6i8Nop6?--ysTv;Xr>_EXIDV9Rg0{6Zg8eA zWg*nx*=MP(8|F8pQ80@B34{Gy!Zc8!oM=UtO+V9v{`*Ij(u#1Kv67%~Bbn-f3t^Co ziGbWfnLvI^7e)Wvr<(eSpStAt2-%v?01BNlxIOYS^Ov~p6F4NX*17w4WpGcnlhh~? z2vE2TVdh-9G0M+#Wk6&lXb1*VpIISf%z)cY1@u;}n}G{MgiK180@k8nY7{*WmHSv) zak)N*((&r&-gSbXq#yFXGzW}!!Kna|m6hPo@*Ekp48$awv(#u=_j%8<0t+(drzvWu zf}*BZxNh>6tb1%3tSzS!gK|xIPEAeaw?K$Eq7juBQE17egCf?bt-zmw&zx@?d$Ns4 zYxm@nZO>T=qo7tRa0{EhLchn547-2|Zraq_?kQc>2z*swLU9Z-?x4PBtN+e)R@A1J z7Js8~dime{zpsA%1>c7;Q+g#UE*Kq}*}{UgOc-=V5fr#K{rhaB49!ty+GUym1lRP1 zs}flWw+vYTh-m{U{e7nAd7Iw8B>wGxMFcL!X9KQFTi8l}&q^a$si}mr%@d!mAcv%3 ziZPNh(T=um)o#OX*u8$HM*%9U>EL*Z*$T92j_#jpSKw0I+IGs>atIezByQ)QTlS7G zE%ev($)B|y05QVa>3W|``FLxoZx<}OjIAu?R4M@9z6TD4vK1}0 zzbA+m>Ra`85&-FG6P^<4Vm}*IR)CS?DPDErdRsoDTY7tnDcAOk3Y7&yW_5-U*H8`6 z34g$fi!xVAIe~zHh6Qa-Fc5>40=h?2vB0V2^GYxYxNbXEAut#g%2Q@S^fC;#G^V0^ za!=~-A?NH$tDi-zSOAK%=PG3e{bkwseu+@Mk+%_q(~Pw>6&c!NO@T|h?=c7qDHA{5 zK9u1Izj1C`Ip3f*YO@B|v;64mMMK1P%(~Bq=!n~w z)RqbCGtb?#*bXDw^>6>7-MfTp_wP>AAwZ}8{v5$QwTE+V_>&n~`aY;ZMHc3Xnx1V~A(R!+L-MVCzB-yE`Rf|)-7|d& zC7pq4BIj)IJL3=#aM8}^t>5&asr=-sdm}pSzDPQ>DbC}ZzkZ7(HV+2d+~W`lEnvj} zh~RSzwH!*uoGBWqxBmeM(VW$wjRNNR)~XEX_j7A~lKLxH+fH+HjLk`q0C});kceh* zCWZtD^*$JsM$=Qh4nJSYF`nC4;%6J0q)k+m5DQwVpF16%B?d)JB3EHFhd;DTZ6RCz z?%&(IO5|DriBK78i?ZFO8_BXQ>@WIV0fPAi+^s#eziZ#LE-GYj8zz?ZfBXNU_(B%L z{Tt0s@W6hC{i$bN&`+oYrjR*Q;6xg3J#SL3DWF!ojRitfV&Dv;$N zNvHr;vXT|(UtC-lja&6i?GhRRfk;Ft8|y)Y`8r%n}0{U?SDbW#s|ta=OF1m{@<5}W?*WRgU|@eX&`S6(LD{nKQC zJ#Exw%)pz|?L6I~8R}V9_WXPo`qIV6j~K(Dn?>z0d!4ktCtLTy;WXp%_H7P}+a3 z*f1XBRVxIt3VueUfaQJA*Q~^X3`pDKXOjZMXgeh8rElDLXV20+09+OUYAXdo;NdkP z|G2NqY@oL=Gys$1(XlYg`J=h={5DrHljHp-;Zv83GoQ6}W-NAbHl2CeUaUX#L zjS!lP7DFpV;U2%-2@D#EfVF=bXWab|=0OUFZy*4!Yq(Sl$vr5T*n(!x8;1}&&GoE= zM}He(2-vY3C(T^VM3IrPU4ib$l%n`|P8J`L<^Hvt_Et7wv?XPNN)~>JMG~<49CLy3 znzYIyo?V~uDu!8I_QgkN2M8N=#6y40Riy;j>`oodAwN6qjd}LH9J$Qt{9h>lz3Hc@ z%WdDuVw^k1%&&yG!fFAZ5`Pg3sUjzpOE{$041yw43rQs%FB+ zkl3`%{K;=E_~%(m05)_FOV^@VxBLRjvn@FWXXju-%57XOJ2(0%$8VrZpeUQY(m#Dc z+6C1+iHI5DXIcR#=0J(P){jY$`Cr%8N;C~)PzAVCEJ)ynrGN2P<*iT~|2rr}xr#G? z=ru<+}q_=5v9U`u>ZSt1YdcE(qdzS(G&bj_+1T*CVIk6_AxQAlId1UN_e zei?w<8&+Of6va)LuHhw0x?a&aseA&b`%LmkavFtwc4rIq>{Ud3R9&(jk!-$kqg*a8oxh5#=$PHJ8AEPzJB=mcxowN3uRdOP;B*5 zP!$+DugmKXN4)C#9kVt`2De*y99C2RWOMu6rh4zycHDj zWhCt?r>xkpmK&Rcxc@n)Hmu7Bt9KcEGznE~Gd-gI2aKpwcgsL(!3(l<8C6TVqCju< za6LeY!hIJD=m({({n-ip#W@<8YXC{vxj`^l01rY!kj|dQd)6VEUb4ZJ-jmfbo)Rtg z2K9ygq}`(raN)WKN)3lXeF+!9spLq}0dxvtS13JT6XnmqN_f`MZLLUOr+*v_DrdLE z%k3RjGpy-!hrp+X3%Pi-6E<<5m@aY5jw)%&%9B42efft&gCk+}uaEpw7+NoThpjb(i61@f?iKUI+2b&q z0zPMqo}`xcCUkJ4RtDDLsMt2qDe{~F3?c-Gs{7?XnX7is!W(UR8z zqC8eOm8ob#Oqkrp?yl(%?{`$3&XQOdZ>fr$F&*C>TMQ)CRetAv#I;2hZR8T$+SZS5 zt?OuTl1Cr_HO3~eTvCGs8v|Vz-IDnaJ_gO~zOLf3(r@{s;y(g55d3x^Pz6+juJ){N zk(C&Z4dXgm&oVcfY4LsBz2MDU;<4)HJuzCXY2d57QOCjEUw3_FwoNeR2I1f`D3q`H zKAg0g5IxS4DkkwPh~vQX#{^Kc|Akzi%(0w1Y*UFNR)_Fm<0d=La*9OP+KhA=efkeP z#Wg~!lWJpnxuaYM-Gx9j5p0UpG(R>_@W$zlPvjIFl#v50j-Lt-i>9@)S@RKfGpA1e zvgIK?+9}i|ERdIIaBCFv1!OO0L1N9n0L)6y*Q4haalLCivFkGZF)x1z5SDi?m1rKSEfn4NrFkzbB za|e8ZWX)ej7~Uu(qA@C!VQqX>-0q*!jN4Bx-Vkh5K7yHKPj(ld(lWRPo;jP0WBbWg|-{|LAB%R8afj;)Ynz6 z5zSvF0XM*!P8LxtF15+6^nYd9A2OLCCL3^{R7@`%d*=fRfiV@Vt2Dify_+4KcbCLU z*y~-Pe2h=}55n~t8gi2=BC&us%3^B<7r_~!`O;`7>3&Vc4$o)=7u?$$ew z8eY>ZV8R0EnlLZLMKAllPj28bw(=s)?y77pLF~F%$Pw(Iy^B}GYRiM-1+-wcM3P+M z?*Qw76>%@at*Tn2m*Elr?LRayWjf09O7F9pg5IA<-Y36>Fb_t4UWN313?U#!Tf{sJ zfQ1YqJ=z}qY?-7G0tn;+ylK)%-DlHR911ZNawTMm?W_cZoF*(RgkT#mm>=K@Hor(K z@82s}Cj^yBc6(PwnEo^OSQ?*kher_r& z85XzV&_#@Xm><7VLR$$}e&^Zu@u?xTk1W1$h51LoN4k z4c!h+q=FihmC!w)oOD2oqvxdw;(l*sd&H?dqdBi$lc!qJKd(B$T_O@{sU)aviYK$_)+G@>S9USpQQ$wo z)t7dJu6EmrjBSiOPrIjjUL#!RVBtK1Ztvoa!riO;amJs?G{BqpaWuyLYB!F}dJr$8 zD-KNBh*t`>`PJd10!)RW0a8YZoz~tlh5D5f3vKZcHDjFl$>)Dac(`@9-&Q;iKbl5v z&XTwwINCPxbE9|WNV851sP)StMu$A5O!&s-AiP-5KvC-Pod=}7Tdnd9o*nFV)suP4 zUUot&Qd4W7SYePy!_;0pxclo};H1s_*aimV5in298m{8j*bZ~1miZ^3XtDGwy6-yd z5p4FuJ(>02Ya=t(#f2|I#+0WyD+a|~j}aoZrQX?-C$vZCSA3RVWiG5oxtIZzuc>gg0A zDKN@1h+Z{8K5d0el`R|a-)Mj@XGBgv$Nu^A^%esG1iouAV$jm49Wo%c6I7y;%KDauL`ngIg)rI;E+AQ4j_J;R>hk z$vx7#VrU5$vFzdRp9VhoJ&hvbh~P%&D~Dp}UfKNp!qh!4J5JkTO)V5eQ^B4qlQnWL zVRU|RpLu>IF2bZ%b$fiuLvky9Aq3FNvNo(yt>z$rkKt#26C!F)@Jfu#zQ7O!v1jX^ z9|5ViK)uIG094+peqG>>3iI6KbQVb8214!+x6y?G$=-FItDb$Pxa6{IRPZmMt&B&- zO5|WI=mbaxXG6CW}p6R6jA;n zdJlJw7NFs5z868xfv+-ROunz^7)hk5%+38RyOEfaWWyKlQg_z|D!+y%6n56bvYA|o z#u!B8<#Lj2HKg0sFi^zyas}mai)R@7g(hF-K-Tx@dy0^ zwu4;w|AY%u`o;@XaxTici_o5Ixzw!DLM%l6eKglgZttWBD+$0%O;gP(09d-;Ly?(_ z9S-;6fb%S+T`2>}4dGED>bet%h4OduCy3p#gP$iYHC;|FhW@#4K~u)I_Je$>_lMTh z@vOJ`1G6mg!o_aqrYj?gl`{l{HUp}dlIHc|wkG@xDa;1_kI-ARS}lq=Mq(v5@%mY# z)=IOOVnm$Gt-|zXKZtMP=yrXf%|h^U01fajW|y&!Cyw!3=28oh zH{kXh2;~noImywdcCM*MeOZwuhZNZnGKiX4WKn703)6susP23+Xx~9W}tr-G0fzqTYnCzw&ZwicjkIu74Fgmp6W4 zEN}Mx zMDZ`TWBUxP5z$*@_>lR&H3{zmvd;8|5U)eP4VaSTsl|QTtB? zH=h3eiRcjOl4gMmzA^soKg#qOy3?02pS==X(JY$SD&PqIAh%JI*O@NH@&-p3AbHzZ}CfsoM+EL`V*~ zN)^7vip}2NC!7nU1N{VK6d6jyO~rds!;p5tBr<2uEh5?Hty+a9*Uyw829cpn)iUPK zPw4^Us@czr0$d|GB3FsP?a_*Rmg~aQw5=;HkPbcA)#1?hGLe|djg!# ziM*t+6$g#C-M_QD$0-+Kl5r)J3DX1_acJ}iChsxx_&spSd$zB*Q5tVY0NziSFwmB_ z896?VhJIj&0^~%g=^^W{NUl(yt`!E)mEM8?AV5O&)IF&$3Y86n(4?HJI=817xbTv3 z>stasGA0=I>jsf6pYL-c@Z9*B&EclFB&PFx;lEb|mWm0N&+UyHsSBlsHDD=k{R5O> zZV`gvUes2!(Ka29qMzopgp0P|sS__jiZ?#o$X&b3iZaDjn!l%Hi`e5ti|V&h4z|oa z_w%oWJf<}KuJI&S0e@9cty2L}h_M8JQ?|7!{d3bFVExTcCI{>X;u3;>6G9ucH)TcP z*d2Qcrh#1`a8kW6Bed5B07EyIyQ8owkG1ej2VAG%2@{~O+l3N#ZtZUpV^Ut@ z(UEP~>+Pm)=*HPgP1TW#ns;TM-RS$Q2=j}!B)yzVSH{DjMdf*pVfQ_(d*hXXM?5X=q1yF^ZC4P&LY z?*svf)qFIydTQkDoH%O}#b)RZM(n&$kfMKAkpCUOoW9?zuTK@5+YlgmTt#m#Yc~eL z^I1NGoc!SiU>FkD1 z$_$gQQRTJZsuZ3u@ft(WD44wfhAU*c`|JH|^#C2_`1ks2Syt?`Y_eF`#ejXP22%bQ zWyxh9l2LSP)rDgU_Z?t(Ya>z_C`9h7BAdJoVb7sabqb+6(6gtBX;j;rU6wXeBm3C` zyWT=YyP+K@!U;rW+#${Vke<&|s45n>et~Y!XNga3{z;(V+%A$Ohh{BsN9r5J_#t-JfD&DxKbsciVSB}u7&ym0P^@TW1M=O5{p=bTD7DK+L(Ft zu_H*81VyExpU;PN40!54=zzYXK0y4Sm&HigwM^=&)Jtx}8nd~3gzpWA;_I}Eh`h5wF zgSWn6_r)N8FtF8)(_k3GmeuwSdf~GC)-ea%bo-LaCbmFm4YH&nt1y!2x2U6`T)dLd zEACn_R>0C5TR)8MPoL3Dg6V}Ju*&IPOt@~PvV_2BvB_58+R2^9k5cGn5kmWJzKBv~ z)cKvqs$I5|o9cYGjBo)kvo)^+5M}ZX1`95e_6a4S{V_Ve++uF0L*MNFU+q0p z)RFHAn7!CvQ4yr9T-KiL4&oG>MCcbqJg{IH7_UJi4mm7 z0U{&6VT6^__V8aR7%FgK)$v`>aCyp3fdzFvy;>+uLO2?1ng!v?VjFVBOBqdL516h- zfgPfQttWp`k-dF-tfg$2%{Wkfv6Ksp84%MS87051_8vsxnl}J%_%{rAS?E1&_wf3<`fO6#M%^)E%>B zq6AC?{B1oc7qF)GOnLW~@qyB5W_`gqc{}_+;#rOI-@)#)CkA+%Kp*`ycR59~xc6bV zec1+>&1eNrh&))duEXT;)Hf@xfYaE zyzg?;J{dx&OF`I}g?)8`!zg}s7Q8#Wx!Z}mUL)T(-bGI?Hn`GmJ;m+Vyd7!=BNo2a z{xPH1geu4dD{2EO6ZhRkGTS3uLX>XI`y{+oEM)t~)Q#yT&0%~6 zDnI}n$5*^eZYd~mTuVJ~1`>zB>jldTnj)80UuvrFgBZPyU1 zVXAuHP-{;BF#m;jZLVmo1B71Tuw7NOE2r51!~2G;mtWPd5>B36OgY9Z#0!?XJdTzI zm4qt&tP+Wjs_bwieLk3n;1LffR8X$ferH4sro{kY-zzlHk?Ep|TKE|%6zxlPFsT0c z=?&TEU(3e1&p@bPZH_OlVqp2|MLb`QDLerto2L5>&!glPjEncul1+}rt23OC%!HyARQYnvG2xe&k7Q(0pjVuDnOj0jQb`j^rmx8KyuX*ca43k zmP5cOC-2t}W|_M)Q$+nbx7B3&&w!0qt=>J&g!_tCu+JL@FS;BuD8IK9GOR8T7zzlM zn_v1wYDQx)Wp0a>8n~)acJPfu39n85y0fM;rES*4V(cYe!0pS|FUGl@Gx)LC!ODK9uguW)bUgSS9Id`<1{92$W$7ByH6H zJFG~?k<-qpjn}JfvnF&U;;<6$lio@eqIts<_uRFb}f6BmLVKgSet;V4nY+g^KN{D zD}t+;qrC%O+eXfR1M|0S0uMjV3j0&J@84z-0HR$=Py3q*)hTBUnd4n>PHCe6I{2Ho zfJxGs;Owdba%(wMY&a_6Sj$Vi=;}CRsQA;^LI9H4Op${qu;4d*cy0dLz0>r*amM~# zDnp)Xcy+U2Ry>V1*NyC#T>yrrf4MpH+wTnv2dQHy_$x*PD>-njAm8_jkcTATC$-Go zWi|t`{HPU!n*fZ_JmH)Lb`FA+m4YT{m0yJzG<&?;J^6cmwBSZvB@uz^3cAh^D$F7 zqXHHxAFdW6hm!^J=ZzUh$@Z2zYF3|=Vbv})W;qqETqlo5tVu{Hgm#e2;(qX%nUZ$L z_-4l20v{P*Pae_hQa!|svv3`pna$ZjrFEy7=TTt;!E20Olaq4dH;cD&gO0yMR}|@& z11+HZ>Lb7Zzvv1ryO!7ns~9`amdg2!cmXJD522#dzr<{wIyqGTIfJtj9PJ4ZRyQ4C zjg3LFCis(MNc3(@_vq)5j?vokR`0STOl_8hpl~jYl+=g#x~b)a@cLDRXUl$8_*J7Y z$~4Rp^8)BkOT_8#6ce~`majnXW7Z|Oer!&_T2*6fN|${B8a#4Am1T->LlJ*30Ydz~ za?tttQy=d9VZm$eq^Fdy{pAtgT|!YBs*o=7r;@SdGqt5r-h!2rie!?Nq;lqgMJlS!A8Ar6U>vS8sHDE97~^zr~S zqnF!>cZEm*i@i2_^F0B6_!6wa^19IL^GT+he4wRjCwkU>fz^1s%Bzqo;Q*=E*3KUuO>iZ2iP|x=S;rBZ=&7~rge<`+y6e3qODZG`yfWnZhgO5`RrCy<9Y=@ z;2K3llSaZoS_NbFZ(Fxfx#q)!E^9dDij=P;-e>*%6@#)a6$sgiw(;^;ZNE)V zm$#-lIbJ=x#@Uj&L_*irs&acPXOJA<=a)QX2-rkU^evDhc6J*5GBxVUxb6dsb+tlF z&F4LXkiMVJkHoLSXjMs+lodxQuZXSMRgZuz_xB|9(N}?B@s=yp)D*KdJX?DuWlIJa zP0#0LDweHUAs7{wWy#q6eA$WYLt~nwgcR-z6RV1OUZ*ER)u< zu8tSm_F0ne1M?oE9EPh-e%Dtl0+RP{?o;E()3%T>XHec_O=A-rgW>k1QGXpKt>*#s zcG*I$lV^HO_A7Fba=TYsSs-b~%MPvBpWJ4aRFL|OfB6jP-hLlrs#c6{FKH#(LXeY9 zA>fMB*D?1@XWyWH+77&x5c1m!*W##Zv37{0mU!#>DOgD=F-_td6#@f& z1C$zADalppS!s$%15j1kISK_RXD$o3B(KG;j;2j~q=^9U0^EDPJGn@tOV2>gO;oWnlV%HZQeeAYo&7`G3d`09`a*ZL*n{MtqmXtU8SVt=%<*WRvx}Ll65m<@ zeO0}WT8u8Th=gu7b4nP=^K8msj-iHePM-y4VC`T<9=s z(gYyJ4}qFh=5>`+zq{A(o>OpSR5o*}5!6yw`PFkpsy?uO8~VY@{(9=i5&aEoACeJ_ z824S^V=Tzx5guE@Vhc?KtfUhC+kbR#r7q!ug*h8SQ0ndE{Hd@+?Vd2s=RDXpp;eTh zSlZ79nq{XqH3cHP2P;N99WHzM{@u!X+xRKhv(oihHkbiR|4igQdqGs@@XRtonSLCeg6-Jz+h5ac-A zX9AzCO>LM0=Qx&GflCPkLvN9|Pe8uK5WE%kGW^}t6`OS$Y6T7gMFE8 zhQ=vZxe8z3z1@MbR;qr9L4CiRWD9>z~iHgY6cb&?H2i{NrT zPevon76-*(tz2<1cF@HUbG)H3KF^TNMWJ<^G8{PtQDI;0tWJsBWZ;m6r$RwS?^3D~ z%M>mblN{@I0xU=6Jniv%eoO-c-efq53v#6K{_f$Di2@a@Ic$Vgo;LB_L7R~K|5bNw#D z`(NF2l8BhzqMF8^ve!rK7AEXQb#Mw9F9JxYuJyO2FK@!K7NM!u9u}Yb zd;cq6C*d5r{`KjA4|su)3$P~0$knA)53t+=DR6xfHyx%Jb!WkDq+4^buG6}3Ig+)URk-_=17jsFy`WbJv zB?8874bznqfHwtN$KK6dL1MQ~!BuSx0p8+X9LZ@(|FNDp-vguBta0Up=svwVDpF>0T}tBxAI-9xf}J+3fH!EA(I&o2j#h91bqF zX93h|aQ_>$l!K69RAF}AUXK|)AYx<7Cle!A5_0z;J^VY zfkcQ3BatAW#R@thB4Aj7PwW8S9r*3QcLzQRq@IYBUV+^Lkyi||R3acS5+bP2Bh6uB!j%@fENy&muo3`@)ZNy$cdq-lP{%&*y!x``iWqz@f4# z;GXwc(!w@3ggmQ<~?!oo_Zr7;fT^bSh^hq=TB#(4WyxKg($H&N9$)7KABU4 zoayhg0APu40h?y*BIx}rkO~tykO}fOM?@?F(EwG`(I%)12&;zOuPB&6BG;bF1;NVx zC;v@K${dZ)gcOYEeGD3!=y(+cf~GRu6Dt20k%PxTMFw;$gO8WDsY~Q zm1Wd`5bVFprIt~c91}ERkp7{Sc?`ERPXMRAA1k+I1DeJ!;=-+awhSWRKnnL6xcu7% zGl$kF6!t73Z_Y}dV@6)7Jm=PGtG=4EjgVL=x@Rn=14s7Bed_(;Q}BCWwzt5c56zV$ zVJk`KvvQeS8K!zBhfZP2k-W9lw|Jj9*C52u5e>)@UEEP=&Xvz z;Q7N0J*$EC(%;f1uA<_cB?$ksM9!@lxp#%LDvGW{AKt3h(VtXVkxOd@gwzfZOH%2n z5Pjl9Tua;gCx{Km=`v~lDJ;+UBM_zcQzMEBE@NP3x6oQ^PNNe0+Yy% zQGmFo$Ol`?Uxt|iK=Fjd97~Kjm(Gl5uwL`$SU^~+X~HaL9tvQ(xF3XzEHdM)-+zpC zVw0}RUpk{^$=%2LigQ?eM+#s4>%Zdevj#{gj0w8tI1sj95p|DO0G+12^d|_iO+>%r zd2SE}YoY9qg-oSZgwG{>ECyrthD?1`qjEaox;S~V1>K6pJ%gUMvW>WJ+kKBUL@%^` z`xoJ{2`Xg5ei;<7(Jv19KHw;*41t29{R|k+37BYr^D6q;E&c6)Tlo!B?Uh}Nv%l)A zbuV&|fy^38iJ61!9uCnlYw(LWqlV`Se47Zf4RT?DL2_;*{P{@JOWA}!bJjmldn*+O zl{m%wc+^)Eu;c>jAxC6fL4ALKNibaw2tYuH1?%GH2mZ9n)`w8yR^i)RY?L%Kc5cJTW?rDve25(r!=H$l($(6 z_bvOgG}Bj*S-uUgSnzp)khco^cd>;P!`1!#QV09p5Dpmq_x};Na&LRjkcQ{1{xzpNNXFPU@@2g}}0=w}gt z`@F0<>$7A=N~Eb(+_{w+w#{eP-6R$%^!b5pH3qP&*ROHuOxrbwv1WH5@uTS4Z`Kd%mrj$5XIg_5&kt@HbX zRxr9?t)XZC8xt8zpARk~{h3eOz0V(njfj^J6X+i{^nGd`hs9r5H_Njs|g;32)& zn}?1%l*7L94Q-s@+&VzTgo*A4#@*St(wZdv(OHn(2A)r)*ham|>OY$SliIxkc9b_H}e_|H4 z8rcuQ#ujP{1{d~v0=^Fkz}g~JoPkkG0Ak?@Q3m|$KO=bC&K6M$NYD{jbIS(;@Nw75 z7$hzk-OZy|-O-j1LYzSJMT9CZ^-SWz!8S;luT!8M$>;;PK1DzW+#9g;u8Yuew~)!a zKUK$9DteZyS=B65+7L-dcDM5@_O_i9`?cOPl$VsgWI#k)F^t=ezYrmUTt~;}7Yc+u z9zQ@W%#m9wg3mn|? zS=9HKNg2?TD|_DOt>Q%WiurH<54R{!g8*Zcg{)}&b|*rC`%($F`vaT>h{-1ioChOt zQ{fZ`sJ<*Gh$`>jTlH;+mTKm+^*uqa$xSg}YRtz2lu=%u@n=~tBaBG;9|{qajKHuz zSs?9zQx#WT?wJtuvv93~`ih?~G(=^NE*WhxNs=eS9)W>!j|;SRpKPeO;aURqvffrk z75Hotg+?*iau1DPfh=oE_xv4jI+_9kCM51B!R^|Le-e=P9>PkD18h4C`UOdbVS1n# zNJLmu(BcfXr_NhJ7)}v%{J1u*O$x|yqHKYyh?`?V7a{koQp+Iq!)dzZ^X)B{1~EDj zlk`T0YPFyoL@utlh*_On5tg|nP@BG}Nm|H~Da>=ZPgbyK#O=P)7GJe%0?O$3*ThTr z*?j&wMxOBxd6n&smbqQ^E#&z$W4gk+1b!(!ZLb3V-FW~`L~Gqi$bDAR3%N#CodxA|*M-I^YYopMC5+wmbT@}`!zh}e3L?M~q5sP3?y?eq$Ie#C! zhV?c$_`C(1LPEx88KT!cKO%141N=VBzY6?&ue3EOh5)y>2mg+dzI;o%Zq>E+1hfV! zx7#F|PrWnnoD%=1zcJ}8!kdFc6FpW)m&nv)vf4OtK2p1y#q%MD;Loxg)kaU}bRpsE zN?_>oBDVvkz^fYgG0shT`G?51)9rmglGfpzy2cx5Re&y_E|;bK?&~Ngbiv$4T;&B8 z+c&4V8U!JtTa`E9w>lA~gIK;ynxA*8S6C;2?D*Yv?`8ilEn*M^?^`E-PDx;VfBhj) z0HD!8=kz+_0G`+-u3nCSY#`b>W2ASnv#_3~!H!o%4&^XzHzz4huvovTMT}NmAu-B%vDP57EEoW!J=C-_jiKwD7)Yi3Vc2jEL;Uz#)mWfE1$UrI$+c89re z=;ZNR=}0(*{1n);fQtZ%Azg#-##`)wndYo2eLto5qEw}#Jk&zRS2rL;Uf$#uzMX_U z#NvS=Nzkw<&S*j=80f%%@$4*EP2jgbECWVJmKZJYZvb9Gpa={)u}M@8V}+wU(Dki; zs0o9opg}pcVMoqguY)ne+F^P#Y1PXgoql#pRvUVlpn=fz z@;CW|@%l`GRaHSCl2!=K>;1_O2iUIZ%u6M)GZlk-#+sjeOl;T*Z6yaYg zWqLu?R+K-x@Kj{+fw7O^S@+F{BRJ5;5PTVno&B|}r_QEO-YTlRCI0#F7WFXCTP44P zAS!NlFx%MOYLsaL-<>Z8OowR*$%y3Zl&TzC@>J^k!5xxuDiqHna|aySKfbxt;ESM* zkQ?^KL?O3UC)@_G13K`MuL{}EW?u$Ski$Bgu8+ny&f!q#+3D6+N+Ym_3K(uhigtIk z#j>_wHzx=Jk18K>zek17Tt)PmHhI~Rn`?c zy@ZJk3Dl)5-LgO406wE1H2dHFf44lH(k>NnM7i=7YAbb30nvL>_ld2vIjNN4ngh_V z>_W6>G;F&F>-MNd&;SSTxx#ZJEZ74jAYjVc%8DTa1Sb;|4UJ+_9CA*78X*G{imd68 z1t7gHYo%z3y0kz;n4`aEND2dn8us3=UHq~Q;XPLEl!ChM5FwQ163=2$%#vmZ6t@g+ z%B*=_#xlieM6iWr8JGh&&Fu}t!oqp1+wPAZL#s_W!ZK$LR-~6&VznKu`;1lJ^PWX* zt>}DqQ{0@PYtOswEYI=I|BiU1(pGWy^bhf7L z2EHwXydAojZHxd7xx_CXRGONGfjuslV^X*kkm0tM(p%imgexgdqy+nHlIF63L7@;FS2`?*8GB2EBZcv(QY6xVavvxG5Dj!S+Ec`|YwgvFbHk;P zf>I?Pza274;0;m%G?_cycCa6V3oaM;$YjU?g031cZSFxdBDgn~_YIX=d^i5+6nU_6 zplnxo@(4@b=n{c}Y!SI&<~gp0&1Q6p%+#(rHYaB(w{+225clK(ij@wdYoeDLszj4$ z^fb@xjh;5cC>Q0!F)DIADz1f&FPu*u#>fyXeTfJ1o5m_3 zDRojyDwDd!tuLeU9}!QGcb{i(y=GWvG;m!3BOyWOWWlAAFO?;+WOmQ5!8>cD{Ni5h z%x2wjZiu7Ag-CdUCx1lVQDh4OvAAjAs6VV+X(-u>{kUo6i8gK6EByJ(m4#S;h z8FvP##u~epR5MW#@pVUrZWZWs;#`|s-l*0kz2 zfham^JSxUpv&g4U3odQdC$nt-;V0yi z>XB`rsVBg4R-h>;w;{!AIvZT5#xVmst#-Ys^q`KAFt=2ovHC?f2SJJu-p@DYWd7Z*whCWf^${Dkl5Xu-bm2W zdAB|Q@rWf0+INc;pPF6i>h3P$IQ6=yb=7{UtJ`K76>+tvgejxzur9(24A+41gu9@P zgZdEllSr$dX!|T8?EJ0PywKlNg0eZ^1&-$L*5U79h{NJoEdziF7{)8Q5&NJ1Zy`tH zzTD(nInWks2@R3;7n06m1&4a1TGgcSYY-dlmbEfXBHl%oY^jU)UTZS@*Y*<*(aoja zB#l2pcQA|6n}&jRA=oJpb!)~5(o1X`oCxlfqL7=7=V*9zpqL)Gg63z{ImQ0#-=$6I zM@as`mIx9lcJX$Bee3G8*k!0R`RBCRk8wQs;!Fgk_EpIJ?o!(dB;2b0ORwWY5pfnr z1&(G^yH6au3w%k^wk)Llsu(;0z&hNVZ0eAMrc9jDK1fHVo0J zDNSY0Dj=9S`E@GCOCYELf4@^?tgX~?j;&|~O)Z4bZAR;3Ffir1fjHYl86OM6Lt@ww7B|<{1K&+~@tvRD6c1dxq|Ri>+I8 z=+v1Ut&+OP5`XuIc&Ze3CQ%xb#Kg|P<3myw_6XGOP^JXH>}zMSPZ|m3UgboLIoc@WjcNzUkOQZAOBVcwk{I9Ea=i1=f}qY zUSqRW$G<}iMc7(EB|$|pH8`~t!n^?t0(RQT8vF4u5Fu|5Eh#}R(HW5@8>p>y?QP1Rxm9-d{ z6dZ7Gws8KrQPXv+Yb^x=StNV8We*;qTbt&vdVRtWF|{0*msk}rJVLx&G6lC(!X>`! z#FJ!w?+5jB6#4MK>*tmUyd|Vi zFG3ZJ?J1e>83Izw)btf;Xe$76Y@~LJ5nW45_jxr3$aGLELlM1VkTA7m{xAO-7yP-U zhURnC6L*4I^ev9K>Nh#iggpYXIWPn>)NaA8-LlN`ub{)v<(cGpN$#_kB?saGJE5Nw?N zUHOfkn#N9Vfhbe2X#UotY|`4}WqTU2L=DU`mtjycmmHEP10J1Wai<^P%CSnp@uZcb>ZMGIt5^BKmzE&nQ3hO z`r`d38xTZX$iK4->$dbGZVAy}K9fE9!9d2mm9xj5hnN01>2<%Q?^AF;w-EpM${VSiID^l~5s8$sD}BOD3yEbU6|na(Sox5A0cem|MBa4kc~wLFs_myc?J z#d%K*iMEEUZ{B#*SNdT?S7@UGbZ#N@{IF3G2A-jF-eP@QZ%5il8HQQ_pn_3v2qD3? zO##ri5w#clEGoqJz3!~GR_L9xXZN#7;TEhOivOqoMcivJ^e09tUOZ*;PfWpJ6xs>} zXP*VgFbQh4B7}`VPl3PojZ5@!3CbMZ_%92fC3d&O2JXfpV%ZSvJH%P#oz^Umd$XRS7_nDkMSHHOqOCkvY z$gCs@f6}TaH1M&RrA(~K(wbYYdW$xJx*fis)C;D<^j~v9rN`$b@BPX$3gM!W;Ryn+ zD~<1GPj(?1IRn=(#~A|sEhR2}f6w{6EnK0d5=Bl}tMqP})FO=8xBU1P0}`W%Zxgj7 zHfKX(ddXGls)8WRNyMNgB=a+vpv&T_E~Ml)sxv-sY8z#~<(h-g+elzhQ(GL0mQ6yu zOf3=qQ1g*wue9#>%T?`XnKbeRShQ9z`|ha+$y*ZaLS zLQT-q$)43usus3xEh>S?;3s3!2&7nVYqfko9Exzt4BsZ`JvKg5xfH3t$912L6>y#) zu>Wq2;1m@2nfpW;&^=vy3#jlOi#KKI3MAHl(+gbo_U6IJT4=dVCBM9i?J^Uf207tM zBeTGNUbrn_8PVb>E+hTpV#ZmtG0#N`SoQ|#?Fq#Quj&fOa}jDu3Y%0UehH;+PK^G_ z4=+%3WRQ9bk>Ns*Zd+KFwk*tZg|_P+jJP4OI$4I{Rw9Kk_!a;Qu5*2M9(p_Zwf>7# zlz;(J{5E($9Q35q^PUwA0l;$qt~3!MIl}iz8yb;UE^JB;XazX?YgHH^y-2@BcXk`K zNXo9CF@>^L+z_=|as_N5zoj%y)Mz8*7h)hnSDZ8QeqfhTULdS#n>A%JJ(|XEnR$tJ zdb5(-+%}YK?K3FHtfkP$Oj;tQk?% zBJMIqQC2jCL7Vv3e|p}}nyy&1Y8>X6F+qg=mSw(2To#s$`<~3zc3M~v31>0Nc1q_P#{Tc87?*GBstD?_e!H=g>2`?I^oc^AX!T{`H z=wa0|)Dn7+p($AV(}pCKC8p*8n`e@q3WO;a=0Mk-K}#=dWbjPf^?fd=di_ZAeN~)yFUVf8j_did%{Fp zbiK{@F={O%>#D<1eep~g z`)~=L?^!Swe!Rx}um8OO{)!-1SdoCN07keyjRZ8>zhqRcdk(?{0lGFQ_dQGOxw53G z7yL)JKhD-{*cV=j1mTJ2p0UKoJ{?`AJ>#w5#o0WSggy%K; z%Akzs-1KTn4z;XJkmpr2#h0o%Ly)YkOU0oX1wDJf7f;M((dP4N(9|M9o*QzX4bGc1 zAS2TF{OdnYUt%BZb9=%w*t3iaNGXKD3JRia^YtcPrHnbDuMxF^-G@r>g@JBS5DEW*vAzvrojBVnxA+B|2e;fsWPy& z0v#8~EhSpiqTUX;^0{Qah1yXOrsbr7?W@k>x#d+47ALMmCi_p#yX~3|W63#-&;@w& z4EnO??@uw^AXX7pL@Z99Rsuwvd|ho+W;F z7yh15*?YEqz9=~zY)@_FXDx7LvA?9-3Y{=TAR7@#-3}}vQJmw&V|}%%!2gs&?OE2< zpvv{ze^;^%r6Nx+O4-BFK5tk7r+|Gf6cn#ufYG*@T7*V9pVjxXTPYYRm&0sx{`G&| zzVJTFP#Q%vlJi&n*CfrY-rL~qIY!FI18i|C(q-kuTyKH!^KC~=S=*bk_PHbcw5Q_omV0rWx^UB&vlVSf z6yA2{KlyLc-v+koB@`~eevePW4ed@uVR8%`_}`e%zpHR}P;T7<`=jF-EJJs$mFMfGPS+<(|xOnQoi) zo-2}@-xpe*0-^5#%$11j*%){>C@g~X)~arZLg3$=t{OkVtLz@nNT|7d8mQ%d!Nts(<7|p-_*A1?qGIz)a zgMg_kP6--6iKb(dBXMik)AM}A%9W-A*PfRRZbTZXk+pqM?!>I&*^y6ZM2Z>r5?fO) zA)5VM_NyLl(@1;Im`o?pmK8@9`zJ0xW9|X>TdXis;*hFcs+i8!DrKtT!rS8@y5t_qog& z<(Hxknnu7ls#_4l>ERYHpTQmkG~2c~!2vYT=7)07w-yo%^!{Ksfnci_fuFd({9NW| zFxLtjvXDJDJ&Ly9&j1CxN6fOEo&xdnw`mt{%UHSy2Ih7x>h@JRvV9n!Zf|2MK+GB( z?Yfz*A62(TR3hOgb*DH?f7=T3OxxQC!)+U9&ip7t!g4VfP^c{G3bm}gk0A~USaQ>H z-*hsBIGy|lM2U1paAC_XeOnf?6$7=R1*R^;z(v6!v9M^Dtoe8qSb@0Pv{l%)rCYx% z0Rn6Tz31uthrLOBKi3ra>%;5wsNX0+@LcF$|Lx1)Q!%X8b&qawiQr>oan|g&e|ynx zAA`5v=bofu=`CzAa3uyIYTS!5Eqb2qWeA3btrM5&fQYH2<$D6l5Rmrp>>Mu{q;*;4 z4Cq)IO_4y`LfI1kv;vx4|T}nq%b* zmr+)gQF_UggC&`*cH}g0LF)H=SmD^v`&_cHfX{7{nT||iS4zb~dcy@ZXWR+3kHg=7 z#P<^$6U|wZ0!BP@&y*ZQ(52Q@FC{LK z-r6NYV5MNdoqt1leuAE!MyOODrJp{Xl=uptJs4_uY=q_taZu046%P^?d$rlqDNp z0nmL6WS)7xQhm?Rj5Mtw@1G5(u;kUxg~0j*+%zh;5xM7w!+1+3LSVWa7RC}r2+zVv zDI7Axv*#s(Gqj(Yg^V;fl^QVVW!=Uq&`4KNaxP^nI2XVZn_QN8PtBmtG6Zba@aGgd z=RQIGJzLujj`kvZp zK>Q6@bcNB4kRKPT$u>Y*Ot{2?Vo0v0Lfj`le--Q@;$pC+VHgFnR)#6Sj{_yYG{sLL zY9Rxn%Sl5Qzf_1I_W;Ss+cla8w+%ug)Ft=1h{(+OIZ{kCW;+P2O0ZAvKL8?3P$N_I zDiQC=c0g@y{;`Yd7DA`*Cs=4HC(Ty?5%wZ=+e%W1e=4csbAP)Km+!-ntBvN_iopP` zwr&vi3Ud;dNgBA#uohcM^+%O?Vwfzmeim3j8tT7x1*1^eLI|I$2!mR=YGq3necS9! zD6VKY;`->Ha;J%pdyRq}X#s_ybp{AP{caZQw;_Y~b2&QqcP^USTg9)!)X)9NNOsr> z0qQHQOc^QsI}i&*dEUbIdmpX6bs+5_dQAux{hNpNhPKGb=!TwJk5blK!EbqV(Psnk z`5Hua>?QX%w4~ad9D&}N6fhldkvzB^D^;H?!gm`=Du9j2IdR}vRmso*HuW4_*Y5&s zyrKMy0T5OUqe@5_s$+3Q)~>4i1p^8i^$u8P65IZX%4ARTbNr(xp|QWAI)#@mb8A9g ztqUmB15JlSJ@P!gjSZxW{&~lqI|4;8mL;WD^7k@7McR^6ePaJ@FG9=BM3wns|$IUPwjgnPu4)( zWRd#r;m{|jkQ0m8Ciu#Y^$#;q`HMzHR}p0$q+)9e@o%k1Yhvy`i*PG@gL6@HcT+s8S#zWLc8qjZ|1f-ITr@pQ0Dt2A`=tNgW;#R3uO1j?@dUZ_5JsH zDFH$k^5oI?e?|7fo$BH%K*5hZ53x)x1CO`;ITX13@mxj3RYilMiyfpb+OA5KgxcLM zJzIj1JZW6Rw-bhVQ~n=3wIVB&<;V=8^D6_Zkfr(<`X~01hQvpFrc@x4s#;EMIc0Yo=VK3$ zS$VIi=VxyIDod9f`fHXn$rLjqjTy5=yTR%m&MvI+@S@-l#H8k&O)5!y;$yf8NmB^<&%wecAebR*HI;X3M#O76IT^#k}MoE zsNC7Ts*>X_z(4MlbAy23Ztb6gle_+s8NvqZ=SX%?iJZkud>LKd($EWV$lU-R+(JgE z)u)g*=?Uq+V#%GxfnYT1?C+JW^yYXeKqP(*(}SGVgX=L^Ii{?Q+rMJ-HIwSEX22og zoxDlSNc++gVJDnx#ETS%YA^tK?#0ISDL`DU3b*hN;=;wcvjoy#n{~=^ ze6rnYZ;-*39m)|w6AzGk|L!*U{J3FGtvH6KZj`l} z|F^6;g~ali2j2=(gii#do`OF0C`fGOBm|{l6VJLGUgR=(otjVz8ZfV0@YffcpAbAt zlLa669|ZD094I>eZpm2|xPHxyF=ku~(;xgbZoiwH%ps33YNae0I7Cxv$wZCy#N$r{iInh0@LKH6u;VG$_ zZmNZl%^PYB+D4d7=a`>qk@qnKLij?-OsO3tJuHneg;*EC4t3~~OB@Q-n;$`fX1(`M zQ~ZfNQ;i39y!2C_EA7Wy`_k|?Uu^a1=p*|K>y|CVoeLT+vTy$NUmJ+>+j|0<{N0fx z3l6mHzU=9Md=IRpLkRU{An+e8d8^1>;+cPrtSwU0NF@7=dxNDwg_ccaD;;gwOmA38 z7h?lw{zhDcZg)U@KCp)etHaAktwqVP){YtLnE#RPH7{P~hGEVR$p;FI@B zYAAC-1QaLdJqgX)+V>#{`!mu&T=$ggeFAc5@h#R8spLM1y!9Xd&!=Q!SO&@_e@B!2 zWD_mF-Y#W^Mn!Td3KISC_If^&^0NzIm$W4_N?TxcF|wJ%kiObZN=d9FZc z&7Y3}F#nK(+SKG3`uW)vA<8)uF3YmsI+N8IZcqJlmJVCd3K2J8{7iWZ+lcU@s%CnQ z>+%fF-`U7znLQ}a`6r4FqF!}^eLmlBqi#<~B!pQ=Z}vp%=dHSs5dkh`3N)B4gVSJ@KKnVla#!M(O#e8#_e$Q=e2fwL(JwI1FI5W*$Rs1_+9+yo~e6_Y)c|ZD!^~DDZ^rbsj1(<;GgI$HyN?GLZTff?)-`T9P zZvTgD8i+H{0<~wLxUT%H8*_$8Z))UYYHGA8YUsYKd;Dw=*vBkyTg;%4shY|@`wx%Z z5V8mlPWJC|QFYrt+hE&@)tny&X>ZC~wLtzu>*wv*#|o2BgmZ@B4-1nQ#d)TnOKU5< zXWP(#I1@ZyDyzaWi2vSn(g|r3f#Nne$Egivrso;VGcMZY0s_5f+pE;dz0eAZZhQWI zGRO=`n>9Jt562&laLZ^eXhBnxhd;5u&lsdKCHMH>_Fj~OYqq@&}32_iKkV<8fM%WNA>J*-x!j9+T z^>2`8->>3u#V9*p0M>25W-o3O;ZIT3oF#t$rS*0;Kth8)_+|Md$7c%zJX|ASlfs!% z#xiC=Y-l__h=G2pLdU2zjAR851WIIH8@G!hvEciZ!%Cg@XMH=;C+R=P(B++ORiWIm^ zo&IBlh9njVFH22=XWT0HSrbS>#*uf$;f+y>soqBNYAYbf1;ug#6os#T zIQywF9B!vHFhWlFjd7#bB5P0K4CY?C1oP*lfWJe}M{_JCmI!8wWna3V8rPSrpJU67 za{%E7oh{gY7jzdd!hZPT8k;j!j}N%v(}R%LamX(hYs&14WVQ-f=dsrSyZCkUPv%um zdc!^-!Gf8TiUM8h#V-nS9E+=0w?;p;TyH2WM;G;TAs2dS?Cx3g{o;DmQVc2Jxg}b< zhOZdiAO^e@Hr}F|!JiJ>qi)rg_}71duvWR!&*r($s}3)LFl~@q%d_7TWtC%O)$fxz z93}jXz#^hwA6bs<&r9)k0`!1$0xT*)xw7JFYE@jeH;kBiMPgNUtqfrg%;c9UJ%iK$ z3L4|SERZGx8gPNZR;I0KTB{I-XFZ@*V-XG- zk4KS@K3m(eAi@U1_r7kiAIrRg&V5VkKoR1zUiu zq++7sq)g^*{cOxoSmWnmQLi6i7ahA{*%h<5KUFIbk<*!z91ju*F3wpYifffJOj`bh z(U*I;0L1(|TXu$L@fM=hR^J^(?y881SDgLJnnmmdk8pN=J_O@(LSQ{OYZWrU745b` z1p6E&42Np(Km+chF8>KwH9ih5B}X*Nbjk48KV%W=rs^&u>=?bRz}_A2A|N*7N@}AI z23BeiU}Nx4Ec(jpu8z4J`#b~+!?0Wq*C&~JwbBoTx;JCGnSYH z`_jetC96QU9O=biS!TAY*fx?MoAg^tPvZpwNrkIgqfX|tMQ0_na~$qW9kf z6>{@Ot+Eg~f=NIAW|KwllKM7(Q;Q#BkOjxnjt&1LaYdYR)>2ob=8BoT#q|~4)W-Vz zeO1I3_(_JmOsiPCkW7?Gj8js~l2dKyx;{CX1_d$_eM&h*2^K2hcGq6i7UcuR)rSai zUB{>Vkq4ExiSH(yvK?zh@F3w<*}$uea55XfXG9QC4p{Jpr~C_)LsMYh49do-l|om6 zy#rmEE2V!C2E5dF+^?XZq|#@1t%7_Dr3wsWhs;ADm2QHj%U@6rX6>oJM)nu$L#$iE z?|@G9)iBcXt64TAj#+^~3(N}6_y;x4ZMl+=ysIfg6>5RVBp4O zKbdn}T&{f_9dg1X;-T&{7-}-Zv$Tfg6a=(zvf4kbxX!Ig`Pn+uB;T{My|XocB{Vlq z3p?z>N-u>IT#+PdV?vsQ#r-TE9sJc-f7g8t^Ps|IFZ7t-KF`eo0X>{J?MbvftvVOy z`t4%Y49|VWqg&Smn1mOTH<|n?5E$h*9&uqfRj!Qj!E+ll12ZStP#p6Z&Y3+(#rY~$ z$omR7e+DkB+7R^`OgxYN;XVE9y?|ER-+zI~mU+qbf zXhFi_oXF$m^JQQWF#9cSQ0^E^Emk14zXn``+TN}06+i^Nv0NjbUDfP?Q4YPOdjf>| z7Ewmv&OV;j`GNFdMck|7IjEuZr4zHou?$0I!=1%ww~- zTI`>vm-RhA1wnIpf~^TeO`%@lXDP6D{v7v~HN{m;0$1;Gupy zE|1-K2_l3)FOKzhuxEG%K+>b$_v19ZXGLcz1L@G*iqQaW|JuefENs>H=ypPIEh;hN zGd+F`{zAgHXldz6eGBN}R{#AuwO9&~=KYu{Ccj3pg7|z@A>kCJ`h^=z|COPj7u%C( z%J(7UZLI1RR=i5P?0~Uw7H~U=wtC7*v}voBbjZ{=&nh$^fIxl9rIk^5Ym2>mOgPV< zm>Qm!oCY+K?8N1B*@l0BoOLzF5PT2hg^J1Q7g49bUj-Ex#)34zooS_Ya|*iWZG&(8`&U&xRz}U?a-jea-FcyOfQU9a`|EFMB4u!OA1A$K7{}>9u`FDHoC7c0(5v7iLKq4&6=7L4$Z%@`x~3{l0W|IjqD0&+ zmjwtb%K=G*Ju(u7gkAV0Ak)#UaLRPAe;l@9`&=pMojp z|3L3Jr7vBnTs<2MrH}>}=TadtJXg|n8RshPmHjNE^tVR|BX^)5jp*#>HK>a}`> z0$s9Y>!(*p7rj7TK<$9*jxhS^5xNDRby@ZId7rCD8ozfB$oZ)VX40mAgh8Al zR&&;pYn3py;_pqqDFmU>xI829avy`lDm}l1wiIDd4p1g+zoHmt0x@HoVzMCRIPbQ1gWA?d9n26 z0FjDNu1XBPwbBEkBWjxjZsB=99MFy|s=%wyt$~FEBA#lz4K|Zfa8w(fa70=?lI9s1#TSX_XfuI;%j{TWK| zXRw;4b%A_DCzCL3>9chn{E^J<3IWrPnt;~~1oQ~}fnqmZN87RU=&aYf8zM5PU=b{_ zi`>5yNrQgjfYop4tc5+mv~4kfINVqV`>QKb=u@cpU0M%UwN-`ZB$pJ8 zCL*duTtaycF%VxH%ePjVJZs$Z>O$HudU`QfxhWZ z<*u8~P<}6p<}+%SXGsmgQ}x|l77MlSs)AUgo1ph5*t?0kxO8#Lk*aYVFfh7*RgKmWHR$rKWnVR&eLcTl~_l;yG5fl0UTzRU``z%^SJd;keZp6r7Rjb2z?Qy_< za1DadE-VakR_}^aQ}20+f3i$Ll;Qe`G?@|;V4}BPKHjs9i)D&;J@d32G+hp0`8 zTihfPZ+`bVJgCvacX4R+Gj|JDB%a7W^~ycor?`V`kjGX*91r;7{C0_|;UC)soFHf~ zN(-`{xyYA*kURRqwC#_pI=(ZbChRI3_bkayeLt0teW=`heN0PY3j<$n5q80(SPhEJX>o_?{Yq1hT6^Q0WVq zOkpoj=goHNMIN3N_(oeR5sMBR>vB<3RxG86-e#(fRXH?1w*c7uHb^hXiGKmec4Kl; zLb=)Ffy@~ga&`;-Zmdc@i}j*Ea}|;f-epl;Ry9d54g-x#xp7trKE)vZ$5mSz;_6A= zLeUNkHi)96H#P~K{3kA-#6teOSEyJ=i^7EpocmWsKMB21@D{n22djVmM`G09uD5^2 zI-8m{Q&Rz>_jx~S(Foc^P+6JvN>hc<+?;j2pDio*n6OL8{o4qHk(zUX&QSggqA>LI zxrjmwVak}}2sR$xUbh+Dn(rin2Jqpx$<_W81GrW;#pnv?HMF9Omw8s3Gt3?%J?VMg z+O~j%Kg|+mMc?y&uJkrzRZN3PrHp>j2_ah+cvI0zkvSFC5-`%ZB10_QSbSdGMj6zY z1jt%y2}j!dgc?PQeoOS(?KRO^gl?E~Jby1B)KXi!Z3h0kSX7=r6cEyA?;eQv+!C$; zz@^m{A{w3Gs`eO`+1F?JJQ5jB{XhP*`SpI{?R9ZMtu!YLrhuRfvmB;E12x!!C2y}A zH3|b{>L2IO@4%!3Ov_xR%xx~F>1a`dQn>W9Kk@B;7F7N5S-6KkkW%kw&f4F0aSXKB227gAh5?>EMH%a2zsRH^>N`u?;@vii^wXe9n^iQC`4eE11(Q{SMbuxf?mSAWoswc>Yf-Fgq^C|+>=R=0R5TcVH)ttx6W!X zKy~p@&sJVwkw^Ui_!&nHnl}zl6n-iw&ef!N#9d~wK>VjRniBu@U)*{>)64v1D?h_C zObRZK;Imx(?I4jthDrVSg>Jbp$Jve{O#AqULp$M;)UCC26`c2z-OAa2a465j1guC4 z7eMH`fhCIJCWZ8xTVZZ;Mt{M5HQcngrzI~19Ipvq-A#QlN#4RDDSar6vCnwo)fn zUn_8H=yqRO0g>YNx8V@F2PT6UWs-A-=&SVG;ovMSm$$>`^IYPG_xcJe#*M5SvYfcK zNGn=$nt^MXGl5UQ+$hPEz?@=_Ma-Vh6Od#Z~S`h%JFIk~!i>|FO?pJ4iPd@Zpn|m&y zIZ2rar;YUV4Tx4ujA-uXdxGRF+~eA^1Z*vrr4h|c&)=k`YM15Sf2x*mv7K`XY**d_ zfY4?C?bMIJzCQ}SXR${s2WL>XjY_|I|8k?g4UHz;rbhCj7n;9U3ga5|7Ckr#lL-A= z%`FSt$;zcmUA9IM7F*aK@I6l276xF4=JvW$TkrGlt;LpU9;xf;du*kItG@;IBqhBI zoYR(Q`hZ21eHIxG>oD=Etu-5X@mkxE_JGx@mU5hK)WD=u%-_0EuF&2v*$b<)SG@bP(ENY(5KAO@Hvgu*9Jzvg1eA+@`_I#*$IY$U`87_(BD8tD8wT|D3< zJ)+Ob{>vf&4&DbvOtGq1fr|gwnzn;UEvj2T!^b>ZO&*vCx46KmwHrooSltg5y<9Q{ zI=G4Mkm=9)Xiv5Zdf2y?Lo@m^!l&mGpi1e&ej?I>hdU1l5R*+!aR~m5${)Oa+xR2X za{p2i-F0@zvz{=#NJpQ$r#3gGy*-ix2e^?=I7(jy;gl(#Dr$#3oN;YaPmQ?g<;S*26#^fhE4zcWrcZs z(q#qW=ri|Cdc6nUkKzZSt)r- zUQWGxcF8~P7pUN;TcyUsH#IH<)Gm`XknPrfY8(=+Q;SB58(1KR$0rrx;uRQ*AQ%^- zw2Caz+?sg+OVT<_5pzvq%da*QWbr**Cgl>m5!(6=+lAV%kq=L2KkEPiz|?8c@h(D#dei7@`&>7$D%NhUw!6eOpu01P+}kACROSQJvi&kfUJv za%Jr05fLg|o5|M)T=PKSh;LD`9d`PRcf|Y_!2{G0+th{nrF;#he<`(W^0~#*OTOJJ zfu0nX4IbX&N#qyhms<-`3{GMH^&e{8Nq@T#464>aC(x9s)VjRwbCqzPVc0Kq&uWGC zGM(6Jyl2VCsgN+}^PYH4y=TrQDMZ_^(ie(CH-9RF{dh?vWh)ppfi+47h?G(HM}#$K z3(IXf;LlmIOf74v$S_<;NPq!ixi%;JxeVL+2|mRu_btMS=&0&4sFTeaM} zPvXu>|BL_0x>p$;SWDG#gH(MF;)2=cB|$6|Y|vIE%ZfXT(pTr|J@S3*Ck}~v!y>*V zSK`K20J?5X1MVe9QJE`_Ak*~&NU}AZvQ_W%`CBW$LZ@TuHnv<#Ax3S)rJMGdb}BS6 z4ev9(bs-|5MT5f1KXFDQgTIpY$8SLhxyK?1tIpT{+;!i>m(>@g>keH34Uk+UsW}k> z)Qe+BU@Zfa<-u`SG17m3x55<##Q2Sz!uCD2OGuX%xo5$|tx3C(gkfsS7^U`^Wm}i+ zNcZ;>L}3sK*}pGRfkDA80#hQtgp3po0?>VU!^T8#!KGUX%IO)YJY^PdEd!-vf=Sp* za}-9j><;>pqk6v{kd6;xDNv?e(+P)K1yKws40VZpfS6=@;|Lt3FBze3Sai6Duj)M- zl6)m|dr<`cO+_fUIF2RWsFYZj4xBf&IKzujen!>4?fHaa)lKCmHhoq_p=k?+Nyxfq z?2jRGq$;K^DfV`wNa@8tPNCzix%7ibsy@m)(1UeA^;W*9t62{!!WIrLnJDFpdY(vt z%*h6AyTc%TK!u^5 z=`%maD)Ux&^9mii=pS2pdwUHE@Yy)+AuFWBwMleyJsp#;ZuB8+U`F*d)-r=M+NotW z=on(QqpF+(Gzzd9<+3j6vE}y}@V%`1K&LPfH=k^(knIk#Mz!*&Uqd{dej(3f3uG}v352}HymT(nky9npaLrhYO zDi$8eu1^;Cad2@GI9ZCYM(Zkui&i1=I3qZ4HQyI4C8i_-7~^8$D8Z<6P(T}+!k(Zt z{kpq-s`67$kXwVRfmcR@VX02X0EZzFqc_WU*ebE~xGUu>o|ahzZ*vNZEG-S0QNi>6 z>pwCaQ3J5%EI{VxTl%G_1C6%G0P`DfK-uVjXjv*zT$Epts6A`03M0&MfF8KQ5P^GT z5`9(gF@hg%nWIeKLdx0p!7UB(e4o#UuZ-g;#3)K#q7sV;I4F&Td7K|W(OZ5zwI~=p z(0+`!onB$ulrEHowCxCWIqEf7WyYZ4XDe_@dq1}LKA9(gN1r>*!I@`szU}G0F38MO zQUbVKBM}l^X%A4B?N&zcCdec&!#a{qSbFeCvuW~uDIk?XUj6EG5t3(o-iGE}5Vh#Ht^D#1ywpFjQ<+;bIqfj@RVCzn zhW9Zn@3X~wbRoB~3TT$yyq|A#dI%Xf6S$1uR80~N$n-^HSG6RVZDRYSoQmrwFm3_F z7L9hYB?6b^Z-lgE*A_<%%#h@5gFx~t7lj(Y&4Hj8fiyIZZK_;P$!-061vcSS>$YPu z0fQ&NH3WQ*p4wF`nUUKi0z!X>3vz_Yn{$W_)^K?ZZw)2-su`<8^f(XJ(|#30xy^|5 z4)5Uzs3NnKzrt*8zxScCmdB}sa9A4BAQGH}qP8yvpoCk6&#yL6!IbfEj}QD0fZSaM z=|T@rc;Nzo-<2Nq6f4@7>b=&mQ)lN{#Zx^h=EpVh^3TFVu_+&_w7{h@32C+3RWbD* zSbh06N5;z@>bR=jE;uHg|fmti%2AYA)V4RcE;j zKYgoV#&1y&OnYIfcCAa;<>2u5HJhkc)8-s4)F$x+RbuR{8j_*{O|RRlF@7AlP%KLs znRid@@L2}_4pgW)XulvxiGRQfEkdY03@4>oT{6L+3WiAWy;}D0&UaDLq%k2K8*vzf zZG9fMUzWHj^02zbwFd~^X7b|P4y=itY(HUaSQ>Z6?_XXs!c!?r=P&ph-A(k0lRb?PiH>)?eJCY@)^Sd983D+~Db3vg9@n=3 zUGk-YiqQgdRb=G|*GMrdNyx3u-!XH0)Q*yk!x&MkvUiddim%X=_QL$B~NbMO{Z*p&nr z>CL$XV2jw$X>9f4R?fE;AK$`$-rn|}O<4SWpxVg5Kl9d^+GElPK>zBBAFL&wzq1P5 z4h2_s`&DrT4eQpA;zq2ugPd0@nSjBvD~f#2`@qzktO9OlMT7O%Ca9Vq08zgzVwQrl zDnD-o^-XGtdQVm|%3;xd|FS2VXqhs_5uKiLEnRO8hbNPqkcmHb8A+o0)>+qEX$Oq} z>9;oe-4VT={JvqPMi%A(dJ7>(p@nn#ONq(i7GOD%f_<>~xu6gz_(hPXhCM|~2EBL| zq*M3I83r=Q70u@*46uhFMo5LL8j}G4OVB3NHZFP3lM4k3=FD+Y-1|Sp$hB>qNjz>FPs5O0Fx;tO#UT za7!C*1_kH@#URT>Ls(m77vhjA_P#}M{W_~lFXgvcK%Im^5hxI7Fn94B(q5qh60@wT zD*B4h4twx&UWM%wOAAR$ENN|W(t$?h^^pF&0bDr0iM==P7A~hOAY4vrg*}INu@K8? zods97#W+a6dCWQZg7RO}>JqY!hc zvBrJ5KS~tguGf^TTF8G!7XVd{n2a$AU8RPp&dhtr^{Zh8!|#xDN^CfFDCvQoJQh!d zmCA{eB@Yt8>G7k_5ViFVBcLGN0`Q(X#ih$-Hf#5Njz`i){a2fdE^(qH4b+dbCF8k& zNi%)#MxmO>Zp~V^4MzsPI*r-XF@CnhaBL*+6k{5o8m++yarpXHWr(C_1DdMmpATM*?KwGerqNi@}i z;8^BOyBBVo=TZoNDjZx}c58YHYXRgM&o-{Y7+Y1pBRqHaba^26}_s-$ycnbTIOT0CoV9R*_ zb`LOQo8yz`Q*blJuZYqDs9#S}11WXSUyE?kLv4ENJ@9jLUgi8R|L-Zqni8CXU46Cp zZwlC6q_B{4E;%1)EV=OgyT9r)KB>hIrffytXBrcxvTeCmxPlx!H_c=k%f8P=p*$T8 z{SQGx=8@;{`WSAEsF(m~1zM%NoqzpLQ8eXc(GY@Am-3md@b_B_ppvuv9#C*{ctbAO zw={JHpFI}>M6BcP}&)dI867!hACMRJ`VcP$r|5N2vi%s~OQ z?)YejfXJyWM_bd`2ecVZ@jeC&)vaVpnyudpF^={@ej-WTP+?zT^nd%WE85<&Q82Ll zT!bL;e(WBLfbqblvH@@=X->&Y@_DzxEdzY1h)J)vpPnRvi9YBEe0wR+0$^p!mXgvv##b!l-FHCTDkXUkY1!tjy*i zV*g1^$c=OkZRxHS8l2QGM<-3FEZD940Z3n^G)K@Zu-E6oX z0E^RXskBTnr^VtcPNnLFmPqm(murwklo}-3@F6U0%Z0S;?YLbJ`crwjLJ^oJ9KGuNHr^p zf0`~gT@_DUrPr^8mmdYk5M)Fz)5G!r0R6B!cbN*YeCXW`?5(40e~P5bb>&TUHNx#f znRtpwjrDXC<|M}BVo!Qvm+fO{DGHgh@L9UUb;fH{{Eqntt_}1e-`;KhSB314_vM0$ za+$+g`UB?r0Ndhw$1wP3D(If&+AR2`fZ+xru#>m}2)}>Vf4xi3K5F@V;)9N2J=cO# zm_J54*PL!*WlHQY*E|ZJs;R>Sn5y>p_ycCdL8mi$q~vs8g^!b4+RkSwmvfs8^42h! zFhyfG`M9}yphZZS!CPilUkLyD&+lh=9|idhTRhp`sr$8-QLv;@0WT3rB<#uWd z(_6ILGVXV=6>KH7shG#3bN9F`E@`*jX66|gq;4@m12*dlsYUQi5T!gUdXIHaP|I*b z1`0Gn2nARYj-=^&v360QyOJ1gD!^)4_S@qYh$qorZeR9QX+o{|rfd3kc^bE~M{AN!eeV=g>yBjl>K@0Q|3+(>~9rXFSjdPYAOt5 z)1u21Z*yz6dln$*+RpY!7I7=-;Zh(BL{oBt6J@3Md>e#p5~!PJ^!=O4g2-7|%9h+n z7>8E&{bPtW^0R!spZFimd=Cw-=3F)KSG}4_C}i;iK_HV~H@>nDQf~!6Q((4}TEPk- zUYOi`2MHR(TH)X5dCK_E^4U2~0a=(%iTO5M*B(!-;q3C<3%A zc4Rh@!cN_}u0d40=dx%R`b}qcBssQRTGECx9(eyq928kd_7wkO zqZ563x3T0@*_*l#$;yy1^!`lQOJ>IXizuXmY7tuEjIdbksl(^rB2uN3z(Xl!R0*nC zdJ@pu-KQ%LfC8m@VkTch$v-WYD)z5IX2?pQ*rh(*OBHYL5!D0lJ`4KHs#2IwC#y|L zaRQ-0RGELrb$<)ECTlSwDBBZp4|-})T4!Udwfjt2;+IOLYW4rWiG+k{9gsNCmr1sA49poccCPi{5JxUMgu7aQx`JXkyz3aORJ;L)n_2cR# z-g)YQkRT6m7QrtKp?mXSj0cq@D3QO4-`xBr)u4`qDFDb9>wUIB^nxMqr6|_p01&rp z%l_AY9;feFHz5ghv1y~qz{l`GKr^VhEQO9nu_rC`tdp%HNy`usi4^U}Y3H~sJe z;SX}%HevkGg3Ag6-lI!E%lw!j&UJ^xD?$Vp5GE?i+Yx+wPl8%H$IvRSykA?o=S><2 z0|E7_m*5c1k$)XM(!*^FdqzxvgbY#n9#{kiq@TB}$N=$!7YToG>Rd92J}ZLnUyB)AxD#ajdu($+-uPIA~p;Ikx`OvRQIL?ujn&gK!CmIk5&6(%`yy z{>pL+q4%Kt`?RF8U=)UQdc4iS>`7AK2q4S(DROY%a>~%{d|uWNBO;-|kT!>n?c4jj6;y83K)=xh z3vxSJ^c4i%np*x~hxT}i;qWY0=}{S(LS2bBEfZ0~`u19aiAaA(wEf%)DqPmv2-5Ly z1rl}7rf=11Di+RcN~@Y${FdPg^V)?4r!?Av?ET9s2obpPyTh!nD(5R`6;7M~`j4O{ ztcA7>YErh8lm<)6g!a4y!~T|F^jlNVK8ss|85izz8-ycfd$rP)LSTJ(5AardQF{PP zEs<%Yt_ZXp>1{cFCO`HzWFSr7XBGJDs@MD#%`12Ck5xW23NDkk2qhKk?nz}jVN#44 zO6JyO6#H3`T84^kPm;Z!fjWj`a3OVBB1PcqN^EK>vmx1ns5GID^8eqt^2%W9qk z)<1L>$uYu}3%GSIxjBI0$8b>7{!c&m8xurRD-$rSoLn*__4y8DH`heRV8j*SejyEu zYo*`E2zr3*vu7P9DVb?oMhNNlq8$JP`dbJ<#JuO+iY=}x)Gc!r$hzNms~2^}Db&)? z_MH3VZS3|I-MZ&Ov^(X2xRqZj`5$o2%2cAIi(Czvnl=yM0E zJ^EF`f-7N=rs1>$f>ZoY-?~qf|d8}M}duN$M(zp*pL1b`9_B(plJ zeX&XsV4vaC3AT{yofB5Q^$qLNOkuk&liljMIZLYUL9!Oxm+Pn&N4Do{zA^E%z{id^ zOjgdSuZ#tWJ^;hA{y1Xx3aeNmX*5Wr*^4+5ZCic}*znMOf<<*I`}i7#-FH^QFva8a z?5ms{d|<(!br#uoH+A|mrO1Nkx9NC-8M6R!d{kmmT^La3?iwaeIY=Jg3C(JC=vlk7 zHh@LE)n6*%#y6Oq$XnLo7NZBTr&L({zZAM~%yB8Ky|=Fb_bzl=mnze>2EK#>GCY-S zOrmwdUhNv9)F=pOKnT2#)$(sooUbAp8`eJjP%cv8a9=n97h;~vuD1|s6}%v2E{(gN zd-;K~y!Ij(as0LHRftw`lLZDd1Cb&H-a=7GyE(4)cYA@^)3tIwTn&*-_YQ#|sWXVd z4Vo~3!f^II1g zEZCGy6*NmpoUMbv!bugA1P3f8$Vm{$drdId06MUYHfjh9k7{yQ0fj?WuY$`AW^r4q zN|m|c)`+B+bO-+a`oPzEzDI(czEbt}1ZBZ$DrYXnb$$}28` zIIZ<`Ah_~no5B0R;>p6S|ArVgd{Af+P&cV^4PCsG;?BqLlUqFXm*m>_?VVJxCQEFg zSr#ic&1(xfRF>uGFsbgSjIz>hO0~YK9C31jL~c!jMx&F>n8!~Xq|8y{@=GHSdmlk; zi0R~^0D0+&PDiL&yd^vWNO%clk+bFe*}-*qkzK`# zU{ZLobxEd_2?C0eKJ)|egL#fIdQ~hM@E)Z468Kvl9=flq{}@ttlS_&fCPqmDR&rZJ z`|s~0Q`je`f&$Bly#+YKFu&Wq*v0&NdCqI%S3i4Ut_QN(a3iB_dn5Y`s|eMun6ZM_ z-|*o(5O=6?Uz9L=^`}qBrR!{ydTV!4AlQetSSd_=m&4a*vbi0|*n7F;a7fcpNZJR> zTxKcOE;o=emSXCP`z$WmAH{IUU@kMN4E=Bb++KRK1Zcl~LEDmj(YPjTt;sojfl_YCeLUj!ZaNCDn*GG4(4w*PW79*s!Rhw?C z)jwy@Gsb7_KGfvC#JwC5z~d963;+*NM8UJPR@^{Ad}$+DEQTeg0*@!Y-6Xq4;IBfb zM2ab)DCJAz{&E8iKJ1}HfyJ$VO5^RWu@m1xm9HPv3!(R?)jsuf9-&B^kUjb zDfS0yZAja0Z27H%k`-wM0C->FoqS65*(-1526DC2J6C_$UV(mh*kgITbBNVXPq7zgMoj zcLiC3O`!|jWekH-vgPf^Z(F&jSiw2p~0-nYQ{RW@TGMpYF+ zyRy~`V}lf_Q*FRrAal{Ur6vPl_RSYLC#&$@_b-ws-}23lM-EuEuI* ziU_Qj-XM!IusMB$2el3CNV{-#=GzBTXv_j#i9}{kwo;4ouE2|oLIjFwEpy8gLVzBP zHb%&}5EJZ3-p%SrA{lLp-EmiYH$go;KgDS*9o+G6%1L>BEX<_2Eq);t-Rfp zy{L83{jdKi*&Eht`EQUKN%xuOso&ui7L;CEu9Kl!>0uAo=Fk@bsL_O}u=Czbo*RAeCM z5@nbIrNnKRt- zc|nW!!J>@b?O{9UWWkD2tN+h`rMMQItdEIp1PEn-^Thf5wZXUtX&9A#E~!fp++J>Z z-b->5@mwZ;xP23m7lLzEOU5nl+2;i>A*TAduRXH)`QFXk{Uz7R}WH zx~ilWS^^|ZIoS5uxN%X_$yw{3D5(KuJTD5I2wQ%}H9Q*)YgOOYZBAW26Hox}vm%$- z^FAXaRJK<%hh&I?`*Ml8B5zZ;g>@kU3~GhS?LC{MfR43UiN@zmIiRvR0dbpfY5*mj zX$_6qoK+&(U}O?jBIrUgT}$@g*?&^N)7I1og0i&?H5K@8QseqR{tq~{KCoQ-Q-e*T zht}>xCtwu)>pwbYDXv`ebG_&ShpY8Z1wv16K$tb6$uL@y#OI?)w{Ne+0>v@!}mkSaz55&RQ@ri zoZBHR`-Hdmew2`QSbo0sC+#VR4n&SLp7*$~4r@zY(Hhb!K*k#5BD zJ}*P%VCD&aru2lS@P0DsUzLEQwvzK6(9_#juGGGt5d`G^4Ybe5TaeKMzRk*lLU8@P zwdeg@q3>De@${rzBZm)=_rQ{;aZmaW++t&nulhyxa2t!V_Q}4@?PW#dih`t9yoBVT z%KZvkwN=*29ix8)=ZN<%QU zA@XZ)Zy5}=V&F9SZA2gpcOXwsadEyKFqN$+%Rq$u`zruyNapqyv!5klW`&q6|G;y? za&o>QL{sL5irWe~s*&YdB>^F5eRhs3ygEi)OS%UY2fxrk+)IU6Iz?_VoAZ7$WpGQp zy6V=l!g<1qLbNmck4KpCs7cN&;c|JLW+y-e>AQo7%p9Fa_sqqJ_?+ zhKQII!{}F@tj@QJ-pUqCEdyo#HeB@n=5PB)$CM6})Tlsu<3i}R+&ys0sB4AW`g>pn z{Q^&>(l?}kiV>{9WP+Y{1PDpCLJDuwEbshXq1H9BSWa*7dne`G$WOQdR{EvKT}z3s z>8&eRu38W^OA2$sdmNfmzt$Sb9@zE!6sZJ!A2Yw#;&fj*Z3O+R6cECnokm>W=ePAO z@Uz4~qsXB5{(G!x%lR$*kq|~e?OG%tXfsv#s(yWLtf-dw ze!qpXNwbCl1a8e)Ki}gXI?@8lJ{!2|7Gx}`K}FcM?q?vc006GrfR=Fm`x}1pBwXKH z;I6GlV>QVs1+mFh7n~}80~eh}l}yV{o`LE^5>BM}uisIX%M(5bcb|XWd-ealgqod# zS_+CPqk@~u@~gVnq{eCU$ryZH7L5EeN+Zm%rgAEQ`Lsg+_dqFG88j-qOUh`KybOn(4O~3~Rd!^a>?DO*MAx{aHL94p)g#70g?0&N?%k z7|44z2HVZH^Z*(UflP?OGufciBm@ohrxts^-(cukcd-7DWR=3IfZ61#3QRr`lLcZ? zGO;(jQUFrd^_ya}j3F&-QRzCL6O%t^v-b^k1nLR4>%DPfVl+!#XqlTzvu8xkGHfXK z7(#?X@Stoi=Uy+KqCG1biu1WCq>57acYW5?z$Wr-f%|5iI=L~6U17UZ$-<;aMR5i#$kwD3n=43Zguf>RVscSd53MGb3HXt4C|Gl=xow)szc?wxGnT*or< zR96M`D&&=OtGq=;F$OisXKgyKvJbp{ggT+jZ&A{Z!xAdY-|;tuoX5RM{Fz^+xl{g@ z(MrtF;Iy+03a&sh~ZyD<_x)hb=-d|LBpnM(9m1 zLeBa5E;k|y>$GKs!2HA#u6ps?CPc?YSj)nk%bNaK(!6C#&imjO@hmU}r_3$ws%i|l zHDQm{-wx35NcBSfs*w1(DC3rx;5HB{h*e=~tpKS7S+9&U47Lw$(lvS&rox2OGO@OG zn~ChG-zJT0lN%nqG1TCKuK0iczX}-iB^;u-sN{Bf_rsTo6e@ETh)5@x;^*SlxwZSb z1lq*P1Q-Pvuxjtwe6Dy8~fSTNik|gdxE=1LdC8^mcvE zx|71~_wN>9^u&S)8utOhtf+*Ga>yl>$Sz=encgz#lYum0K+8~HqV6$<=WQkmvM0ZR zK`e(dBHF?{W;t{Z{R2jnGCtT^XjH58w|fyJ1Vk*Y_p`kUus_Pe0s^KiW9xmkV_{CV z4G_WBJjl8OT3Qp8SeC5Ip68PLNgOu61j^!-@^er{Q@GDPMj5my8;jn!YJHYCH|LgV z(zQ?)Qh>&SF0b}rLsB+P{o0KeiZCebnP+3V6t1S=Hti#CC41&zgU&sx-Z+XD=$~Gq#>odU70lmX$BnBf;uMC$HoxE9UP89$bK6@kgRcF|%`;*D zXxl!_a*u0Rb2e1KN!6UeJO;QxZ%zU%QPbgHu|0n?zgDiI+XFxTLBPU2z@D42aQ&##liKKLz&zAi*V(KuJX?3HjUC-aL*7}9+P5QMkp zfk~%_A?4{6Nr%X92>TxH*_`hgAH$z-MI6Co+4-OSUq^ijdD<1OH2b~K?*uX>G*~uV zn`c+rireK;8QMaI!upu^L-ThTqQ-b2X#BfnTins|Gq+@odp~9u5bFEYm53(m>(|zm;9WLRvYt)ck`3# zNiYn^Z+=sp$xgE#k6EVA6u^6%#k!#OLDJr5ejq-jwOwxtdQs*+f2&V&utb7>sI646 zt^a-2@_KII@Mg8af+$2-Y_)(ho=?~Mvw48~Y`7>;A7~=dP{U$(O=^Nq^*NYkDn;h- zlnU5nuJuK>P_1tPxO4F3_RS~P#`9(oEWZm9E6B)r-A{tiab4qhd2mp!WpVL0g%bxi zyfT$_Te$m)Xl|Rszts{p6d1%{yd}PI)5-v zdNX>Agxxwrvm`q_{?&TOm&|@P6#y7p6le6?1UAH~p=B5`8^!(tSkFe+szCiEk{@>| zm%?P7&`;65f+E z;$Gde{YnYIvL&;82_d{KI4+W_-}GHpHaS>13ZIt&$*qfky-W{QjZ3gXW~kJyxKhC2 zygAF;fy&FeslJ`}&&%4{!qjbDl<*@0U@W6J#-p#G4SusaT;)}i-Ygn4R+X()*tfTL zpZCeuvTt8?P^?{aiXUlc0V(9go!#?&w;SfJ!^g*_WFfMwOzMGDCF zPww|~X%Ihnc#GdZ!wgWKXDZ6CxF;471JWzdLReTvx0VHv)*Nqo(?M>L3N8=4q#_$< zff3o?&(a1%22@;gcGG2B&7pI)ObZu?rT;Pwh2LO}LeYgGdR&a-a(IEAdF zkOYv7f2uZHNG1RJkMVJ|C`TF*SQb|Q>;Ex&u+P+h{jv>>I|MV*9E0H^75x8c`hQ&6 zz9v`@itocfEgC6M+eqQzAdynzVUQBSUyPvl3|=vUJ~MbV1NIEwGQdw4;iQDo!T<$e zBZ2{dFzWZrtE+qNsa^*841ftn(R-4+8NvXu`VxNHbP#K-L$vBIm4NSHw%g$C~028a8g<1H=7NZ6Yp{ zTm2QneRewhSwncA)%FG@Zp5{eT46wL$Bf9;(kJXY(tDVc$pFbSzaT*pFrXoD>X+R* zTQSNoSU=rjl&xqA>@QHz1%3X>4ghsK3=}g2w_*#OCGwB_fuJ(A!Aaa)x65wF{CEXj z>2i*11tcO(8u~>^wy{3<7~-bPiggQ_zg5Wm^!h+B{~@S7@Uvl%0+we8;AiJn&2Qn> zwD*|;;0glr2McJVm|9{3DB^$jUj${+CP3~#L(!(Na**C=@f%c_-pZDi3_@7z&xn|s zVnXfuYviAwvI&(KOY*E>0^IprcKsb*2&nZBMH*8z{GAc5MMo6Z@(OKJT;Z~QvggWY zf&1hi<^$b|+;G`Yr2U7tJo}GZ$$JnmLB9_tEE@=k`RHzGq4NEm1KwUBEJLnvEuX}J zRzAOf=-!aJVJBr_WPx;H{Cv+n`8lyr^zz?fg@+@LOY3$YvMSuAXJbE?yTe-D|7JJwvr| z)XEW-VB?9JKt3=4c%F}m_}K&Ff6#Ic!p}J2l zpqetPFytE2X1O5>jRJ)bK^^Qk==6YrF-)p1&h_yosPzm4Qvh>OhKUYt(1CesmuR1& zfuR6ww|erDNd6fi%1&rt4Zl+GQXuqeIE4(lqhqnKKVh#!e(-leUhWo8%U}kCear*| zOg>^fI%tdz0MMYMtaHk7B1TJa@4iy z{*g=E->+Z$Upz8ab`d1_2PFoaks*llq>eGL=?>GQ&Ua~jwkFFqG<7a?>qqMU19CJS6hS?$A~XszYfJTKR?iGy~%o? zr#+e4_aqk+W6ZboAgrHDp;iCuKcoV_`1?DE2)pRWsOhQ9NqtWM@l{9@!U)ItnE*>M zk$F{dEi0>9M_?fif4MPIt4o<~W-2v8Qn5G+e8z?7C#0Cq4s#~|^i1g@6{4c$r~?>8 zD8TOM7Em+Q$v&H{f4ZA{``g^UpFe!of^G^((jbr}5r$z|Ub)x3<$6DZc6crTA}ge0 zkcOtB`z*7cS%Oj=DCt|4w{@u$JT3pOm{>REx&;PH1;wRl)D|(H5%qM%L3|=60ihS4 z-NF<~uZD+9Etk5rKdFWwhcdB!pB2d4(THA@+X!U0$eu7IIk*nP0hlm3p6{2raNAW1Qf{ z+1`j-UP*)V*FC{=<#KX1Zeo>CE?@xx%M?cjwJbQa&yv70O2k0-1PLJ0mTuMUVO&9H zk8PB$7?c03j#ACJ=j}xY_Q~{~8-I9_$F9w}Q~(WXNsinhmf-n92q6bsC=Am`>ei+L z5*SuY0^D*1E5;~C+~=nD@G>Nlh*Rb^wr-)99b5`lv;s$5uy@6f{VM&kewP|i5Jvr0 zh!rgU>pyDyThRM(xMj`hI9oq+BJ>{pLOX^OcKqy5Rj>gC|M=dFNRk_#~rFm0!}&V<=$OU2@K9{9Ukmmw0wS~fhhzllYAZD*m z_!Dw(5#*GW-+3_cQ@h~~X}xDiMDCY=SKrSF2Z& zZppEqv7kf&Yji8g<0XELMG5sDB;+)$`;7f;_>-XUtC0r9E8y31$?4?Zv@Ou>O`SZ^ z^-J_-T)hdO*duHrMC_uhD$dBu0DS826+l;QBiLY*-3=?e?^Dz5S}{A@FslYIG*!+Q zs3|D-kFMvbk2Ls^A+u50^X_B z5J$=hFNVq#US)r_=yxjsW3=)tnffUz8_ssY;cg_M1Tjr~=NJYeX6rLl5CYKN{wJ$xFAbfn?wIh6FJi^WQ z68b~fENAVm8naMNIDUW%_@$pwBo)DU6YdrXVHXRA5l?BE zkSt5MY$T_{4_u#~F93#^7Mb8xLzndIK)5lXN-1!D$|h_xU>>bb-lajLyX?)cghMz| zM~QViOzL8D?4S95*LAtD_U{A@FhHBpUZBvgwI?i-;)@f3GWv@!OrzU1AO@ivdV!Bk>MglYKEzh%iJ>%REfX00M%n~$JO;cnypzfoys$s3C!%SG7XZ^e zf~4${0KJVMv@#>XtgygRF6?P^q;{Ba7m>70^52to!>XP3Xri(<$y6mM#2dF(+Y8-Q zn8Fui-BvZN@6OrHI~kecj|9d}O${pSL4X(5ZxrQL4*N@8I zzQAI$pb7l_7R!)eGA|!E^GL#p46JN$u`yG~49U6)pKv+$;;ZjX{(0x*twj~6W*VHo z09au~SRg0gc0u!?ctX?jCIsQS|8O#9N}3;~j!yOhUc1ku7vdM|iJew-EAUB&T=TtD zGyAAgkW;cEZ(XQto&oP3`)0Y!HFt(Us{WP_U4MWqt%B9Tezzq+|Hzo8IH^k4OV+ur z!W;bTV5LO*@6N^4rl`sae2FtmWrVXdu+c15ip-PE)}ljORTIAns3647-=9|C`Vm9! zpk8>NJ$aL*uGXhmi1H*FmAFPGugg#r3f^vVZ30!?g*UHLTgVLq*DWj%Z9dn=PDyrIdNGCWorohC-3%!mrknIDMxat3XJ0 zVQEQ~W6VF0g4$>gcPFX?X6tl6TS;?Iu_=E5Qhg53?-+OXC$g^!zjD_~rhs-aoc_;3 z(578y+Em|AH1Pl^@N&yq{PrQ#WMPQ^`j2@o();Un5ur5;sJF}z;`GW2x0gfwL;c%K zf91nu+3V-hXNlh$k#l=eCKNgU&;mlY6maqk2Wut&)zo{^S1>^HP z?gfB=fL^igd6lwjQ>gH*c4{mJm37VeVbP)N_B=>?bqk>83G!`_TYC}=o(-g#R**dw zz?E*#nZke+{3A5|RshA==4xd~i0Jf2_pCd=FJuZJjq}1HLk#8fHea2$BH~*l_bkDU z#D)7U#x=(|c+u|MT9NoY@1;ga-Gk_+Q8s$r_gRL-F4PgO6vZxp+?*FVJC6m|R3Iaa zVrhF9=c=&9*ko#`Rc8@xLE*}ddh??pti_yBdPV0a#v?zqiF+0`MR#uXMS3_5GJrN^ zplqFCuczjW&}LxQoarUqqhAhc8kSaOEd%^6D5~D(lv7KW+x?zDK8cy<+b~VS$TNFT zH&X6nO}$SP?~`f&rmBqdZ`MuniUp`jRH`ZKbAB?}9oo^_yv0NGaqU5xazpnE(AVcm zv&j;;;L_@c$gaO&LLgZ~!Cgrbr8Cl-qB)>xndsFjCZ3Xs7n8HgxXPwHx0VQwpr8%n z<@{Y3Te_$3E_c4w4e$fUM*Y`uP@%Sl8lb*h7hPm_HBsXF?!c(EjHb5lejn7zRiY7Hh;fwtn| zmq652y?n7nZ_ExHhv?#d(4@qw>iPx)ElJURv8ZaIqPviuFDtN0E6jV7Ov>|#P&;g9 z-wrC35GH1b^I!Uq^;Cu4jqeDYf?1lpIF4iLPpPT|Q^K=Az!9MM5x81PS0P@Sen^6S zG4J5q+Mx=IZLk)nUF2CNH~MID!LZ-wGDNnX(_Y`00is3r3*9blPGDyxVliax%WNFt z=SVuHK)E>B?)yqtvHVt z5RHiRZ>6+w3taScEIRIcMvcFYxI&1+L_V05pRsn^rJCR;046^KR2!mi3fSlVqEU{Y zi|5)N0!`taWka)QU_LB*ONB;hZP~ z{alGKuFyR9=RXm|vZnc-pGA?W{Vc#04TA+3<+S}9X}K-SJ+etxw%;cU;Nf3|jc>!X zHD?Ovv=Xa#2E7kS!OT1p6$`<|J|I zfBPQ?B;DqJMo2ls<-wqnx3%pErVt@r9M({13=&fu_0j}Z1$dA1>|2&g4iRWIt=n)) zKSQBjnE}{CxD)YOjVz&nnq!-QHWIcHS;2OKQO9(1;j?6|^StE#LV|PVC&hI1TLxDN zBKYyEGKpW5tm1_G>^AHk(LOmZ=70A8c`y;I$_1#+F1ghwn7dA0{tz? zVTRAG?N=Qd3>eVOzmFHsw?_SAH>;ls!BveV!~ky>*H8*0*HVd`zCDz+XkwAVKFc6L zAS`%B@h_T618~7Tp#CXdGDFK1i)O+o;5`b!HwOXgR#;b=Hc{a%2BbB|2!E%3L(=+W*514q1*|l-I37hjuHAcOXDMz3E~6N9MlQ_o z!J@x2vZ~&$2c*<5nI?CD&IU>gE8jG4Z=V}S0!>7M#~^<~7+S2a^IsCkrGU*w`P)e9 zGy;B+75JY_k!*wiFl)5K;%j3Q%CJKggVKggBCY*{igJsW5@(+uU2Ht5Uv4(Lr1lhl z`;9iGDpcYeDZ5A~OVspkg34wfzmFP`s*{ilYJ1Ih05T00UDYH?F>*HLd3^MEiIf>l z@4~4z^o_aq7&zSts|Z%+?>Sgdv9QW)KvZxF*3T*u#V$VQEPcFT1xyC~l1uJsrd^#! zrFFJZOjcEaMu&@{;ITRH60NsH{q7_VasmC{c^3^M8>NCA_<}HmrCy>?5stl|1x8Xg zg+8ABkK2F?ml^3PBXd9D!T@Ta?f~b$RPokJ^vw}}Jp(EmHpm82;)ZbZ)|=benb4pY z#kt2MT;0W2372+sa#GlB-MgIT8KAnYodYkmu1WQisx63K5)kSg#FVONsKG5>Xq#JhsW&W&h>eBQ1+(k;X=iskYWC-$CMm z=G|MUS8(@l>O?`&DD6di3N$x|G@l?GZHmp&Sq0qO0Wv$d*~&kM;zl}1t;FpXImY@e zZk0v8P`kVZ$CyPI4D(M1`9r{as`#sME?VTq)p>Fsx)7-N8X3x>#_j9_awcLQhH`$?7??NwKP zc<*F&egZC(!s_Z^*m0q|O}7mSG?+Icpd97YU@R;E9DW1`)zEmr=PwfD%C2##r`Cm( z4Luoene`N&AzEIip>lO+#5}1_Sgs{m97tpH47?c(NREjDh8jp4xC`;MVg{Xn=Dx$K ztKIC^CE*P4W8eF}*xoZ4AhFE6z9v4^wrI$|RSmQUitH}Cfz$rif6!D|&(~YJy|q-} zHc(zmK^f0GpSS(YPYptP=5uRVZl|xT?Im~X*zD<-)|~U9jYce=33K#!uFl%3 zebz)$`&@lX`yfbGIRWyk1ev4dkl;@%#p1c8Nio2b+mgBXIT=>__V@c0v^cIEvVh!9 z2$1`%5hB`y$jKfUMBNlY7>je|{{Q_OfwhvJZq-}gzoFT$YHRAASpG4@urNSvWzm;_ z=+-Q1xg=SurJ2Um9xUqb z>}{+-PMXe zVhcYBA>~-9{h6OO@T|5r1;8L^2ncO_-)-z!O^S-I%Fn)kBY-P!b@)C*V(a_CJqY)A zd6+4<5QSD8;e3hOq<9bJ<5m3G1Yja;%?XxgU3g371@hdA$w2Y5iNf2Kv81%$4;PS57R$| zFeH(X>$5-qYJtiC47XSg$OZQ^DTBB?L!$opAk4I_0F6C=rA3?Jo;_=UZ)>#Z3Huh2 z21Wgr%M^n1G{vwN0SW%d(`;6%Tt2GF%3EY{Wb zo~bA{W$CJ?y?xIc3MulICC8_BIG;@~;M(%(JvklJ|R4o0>CF(U_=3{{n;n(x`9BAMZ2s75SX8wh=53MEdhXLEy3N-_xHqQ zhEu=#{+ZMg5oMFXPb1=FCt%Q?*D6z!pQq1o=LJr~5ca7M6h)%ux(p>C)>PI@b;{iTr!n&s&rCJBJt?QCn2sWMk& zdBr`R;4Fb!_W(?1&I*9c7-bVgv=wP@b#5&xq_;xu83^gOYK1j>{87Gi6U(4ivBF54 z^jB@bZ7`bSZH}fB7ojQ7$JQ+w%GPgBqh3`FwgWC#GV!Yy=@kGnh-M{!CP?Wf$JPpE z3s9{5BYTJ6@AC&@Q`Y+=+L{CIzy3p*U)%szaNg?WaC-=8Yqctj8Pu=-A~KiKZ0Db= z+DiZ~4ee(rYkJ%D=5c(YP5bik&%&|EqHmQ)6&Dgt(@7z2Z`1BX=6m*&don4Y?jE>I zz)(P=WAtj6GW84GBlB@WGXBApVAY5`=mA$M=l6LCpe_UHQq;u{B+G)~N;+}dUH+B; z@P{#I0QzY&n}bacBa>RWtYC$j=K^AuIU|fvyMc_m(8+x1q}rN zT!PBeZO#f3UDv9KP;45 z6mUDgxnCE8&>^&a!QyT2$GnxPb_88*x!z~%*R(l@{5BB&UKxh|R@sj=p{>Hw(Q&d2 z&s6k1TTwK>Jp?Ii_W}V>-?psi?FjyLQ~m-A`|Ro;yr59amENg9=L-I9 zfq33pTFtu!65iUg2=OGZ3Z~PEf=zR&EGkqSh?aJRUm(}@Tf9T#Y>*zAZaOulQs9HV%fd2eftZ!@e!ilj{9pb%(cI2Ti=w5AqzViA^XA#$ z*R4Xu)D|cDCDhJp`!@Mw-u^8Drc4=d4y*;|STWMB6>c+D$o`_k%i}q+xj!6IP_X2= z;mzN+Q#MnW3QdMaMalN;Lq>UQX+W+e40~e{?Oi z5o_EgviMwpE(?o&gb`ftS@zF>8h7N0!&}AR^aEG;dA$gvoRkdIqRUYz;ns`W?MLCA46YM(7R_@Yo_|^ zkD-g&GC)w5%L7N)v+iI?-qP={>;9B-5@c2!G$7+=ovn!b4vc`QBs~!VkYq4)O_K@l z!#U=a^I){Spq7!(+Xj5zV|uHwwXEO3NL@~c72HD75eKPTwV~xsxV^`V8VUPM`U>@x z>MbtF1Z_e?tq{8WqtyUFtcdcGTQ%oyp8+%iCr`db6AZM^urIc+P|~LNXAPQA3~7X4 z7q|~@S1|pH3^W#?e>MP@$4F|P3H;m>gR4Ee2OCA_@3=9z=f{f|J?44w3|KePGUR>q z`nxVfV}mE)zy8z9qWqpk2U-BK&;9IFvVZ=SEtV~EglX_LfTe)ivSDc|?I`PXZDa$`~Vy$y4Feq2}OL`zM z(IXhCVCxq=nZ{Y4ToIOQAoy7r`ez|sU231(!`g(YWoIErGWAE1+%pVQxg;U167*Z> zM4OK^{6tsH{Xr4n}%3k=Rxj1J(=`+Hjg zk=md%RALG1zK7n1M*A7KlJYRsSteng7^&^syW;Z*&+?)K;yugIXZ&cf47eq)P)1F7 z770lcuD3<(;=lSIvKE$>NoUD=B^vj;h*A0;zC|SgYh7`#9MMI*^)ud;dV8PCvv@yv z`H8i_kJq-hP{ZFJ7BLu+WQ+vx!+6EM=uKttx$LR$XY3XhK?zO`#Qc)uIFr)e5A;<| z?Dl#089AMhb^vcckiY#S<<)2_0rs)!60ArsT>xmALv3v@&Ml_~Jd@%8pm=|8BwF{a zbU^Q)4J6VP@vr|~Jb$;csZhK)w>CVk{grNE$sm*~c|4YG)rwxS-ZSSCT5wB~$Bd?L z^`CkH%sC6-7f#6i4oD3d7ne9=t$L=Oi&V@gM7ZrXU0!V&+5&>D`@qd)3Ed)M3g=FR zTYa|??Pu(>0Gat-i(6b!SkX$&WOh zYZVQn1o<5+ef$V3F2o+!}$JDCK*@(x1DYO+#YNjzttGJ5&|27veJFBMZQ#g!rxMq zJ)`MOTT{ye*BAw-mMUuy0?_IQb`iO!aLW=c6X@Mq>&MjcTtN4%pJfznt-_z{MgPO> z(yk>pZJZn8w?az`D!QK?KwHoj%smVI12G3UR~Gm_2sbE~vXSsl(k%TF5z^kzM$AAQ z3ClB)-quEN-yaH8G&vyPbe1cor>yree(6zJ(RLhEV5Z^}=;GTjer8~}1pBqf^Omq2 z{rke7x9xMfg(X=yeZW7b04XQ}BDZ|ApYR3n`YwokIdZZ55tZff3aruWhg-(e9M3f= zI90opuTXLf+U0EzP|LzBipkBO2e~h9G$O#Ztu($9ea9EeeYh#|`m-Rm{jEbo{qXa1 z(V1Qd3uCjJN;omU54H8Vo5Fn2D>WV_ZA1ehi_Ze{*`uT2WlWgIr2=CNXZ~|*xN$2d zON;e|4;2e9)ma+l`x8+T;uOwIzBKce5hmfL%nuLHU9LlHwgr$f@ZIE`n`CZM4hTjY zMD;kc4EhgG9lmd=_StCaj#om!!iLvH5W7G&KRH$N5D7%5=Ap zYWD62|`{t5%H^soOW zd8SfM&IXXVtQBGc+jKE=9M0Ja5%>zFQUV1*I!Rs8_gM-=YU&4@tPQ%n35&u0 zbdh^jv>;!l%^)JN1h#X}h_xwf4H!Rx!+pY=))Y(v`Tfe@Ct;bm?ZR!uAHO1V#;DL0 z`1ufxAV@`AlW5;`UHm@QPwy@7nfg^tcg7W_kkph@nfj&ku*D$QXZJDQPc&$7lcm%X zA9Km6ep@$nKX0m%FlKSUsA0Pk68CIPEooArp#Tm@aR?b^t$sT|;Aiz#R0B7%($;(rwM> zi*l8{+Je)!gMLr2+|MP>`s~Die8qhd2EXRdTP{OBW1<%X-algs#42fXO)@<>n7R!m z`*U-}EIG}f;uRxo{ruV$%B(l6De!!QXMy!O@0_@OPlQYv#UBrZ)`iu-S)YW!TFW#x ze0WQy0@d)?qog5cH+ON7JqW^IBK)RkhaqC1wfW%zErClCjRKyM-;z#TX+WKSHW>~E zEm?{R9t&bQjVc=Q^c$7h7vyP-tFXWHODG^JKRJ|PlnfM5Z9l5TJF0lk-qk!!*kno< z78!a~wNPwdhzZe?Qd-K(jq5Xvb1j!WTmmsJED;?b?_ zuHF~89AdMRRdc++5#m4S2%yT+J%0f-o$jYZR8?gQr2oGwosB`2SlD|E&H@Wahpv9l zwtl4?FlGXmA*G;Q61^+}uAn!Zkl{aV^l>Hy4l5B__y7t+t!y6nfZm9|)OE7q`P8!y zw*uCa_4F@dnuV}$tEnnihBq7L2lqc-Ci7GFC?rr~_)wCgimE9dw79h<8gr3|>`DuK z%fXr660WqwIm?#*?)-wq8<;HxEJc0JZ)T--!!On@X;P?80mTdMUkYXkM$2IVPek6Y zT15nd0+UvU_!2}37?3L=Rf`HV`z(>)Vok`SWM30L^{@XXJhzatT?YAE8!`RM3q|E& z0S-QIrEDlcjX*Q$$yq~TUrN~2d#2|sYkwyB^xzrM!1mUrEbKE{Qnz|7FwxY=6rn5= z*+t5k&plp6CYPufwevXqX=3d8A>lS$q&Xcf*;D&7R>#DJ3noRaHnrGwo=Zd#CE~g>) z_qi#C~Qf$8lL)AwG+`eV;*o0hZo?+@B>I3ftc&PE}S;D#CVB zV{KEDS~}|N&r@@e1EaOBw@@Z?_VzRrg(L32DXq(TQ3VkhHAc{&Of;P)gYBAK%!2U# zssJp8GKpOVH#;krYiRkcgVd(N9(m}<2Zah)w;W+=cZlFOx~)&`A~_JD!#JDI%BJ9i zPq2c_XVbJCX9otDUPeT?)q*(}+CikjlSJa~D1X5ot!-fX;KgC>q`*F;t5AJH$PSFR zAW#npX&GCcDno*FgAF67I@NubgD@wXUu!^KPm2&es~Z;)M0(Qn2F z)S}oA&5(!aI_kfWx!;sz;ZrPm0+BWeYMm5EyVWE2ijcl{jT8>6Ia436LVq^4-1J0x z!$6^3Iw>(|h))lab$vIhJ%^46(%R~wO2cAvQ2qda_@dH5TIm8$sFdx77NGL_??;w1 zlwiKVyQo@N`(0A~nn8#0vI_mnr1WaTy685yJi@l`gFrKWHYgGe5Nho_kogMP#`waI zpd7nd{-1|QZ_!ffu=kOAibZywdi8~S`zy9f=zwcjGL0RKk9Y4+nU{A#lQ{l^d*p$W zyRVZ)W;qP2e!<{XEwPBCSDa@nQXf*MY}cYvHDa(>JZAC63M1Ft`oNvq0CclzYu2%YCcBLU#OnMns|QJv&7-#VM(8qXlkERU0}B_KPUvmy(*pXuWXiHAho^4Y6SvP~GY>xr zU)4QO?{icsd#hlDMiIqj3XK=|$vK}@Z7&($%+d)L&pREaF5z!w8%zqsD=2?VeS{yR z7qz_VRWKxI@-thio+lW(*41TAC78mgV-eI5iQd82fuEokpm?wonV+v)FTw5(WXZ z=-aqI41+YbV1T2tzK8DNk1AePf<}f4+5@yMFjYVMB8?&82E9_;qCFk#S#oO?SMp2qcFefVa64gw)Zzql-bz|FAQ1V_VFV)C2D#-q*|QA$ z`+1G;%YWXf%HDf^_niUa4(P^eh^Rb8{&Y)ZRaZS_$(GGtxUJC^u^p$bo-1rASup$g zQAi-M%(607S%B#*Wo1vo=_vx1HeDZ=5OP6@-Y9Mca3qbgj9~d}y`A3sNtmq-Kc2|a zat~MAS<&EcCd>6r<)8NguZTSp^+0baW^aypx8?l|5gFW~-i|!g|A_SleY z^`91HOeGaLia4y*B5ZHP1sT)JXZO4c!0@-C&9WlB0LleHf4#jTwsu={mN3D%0KMgN z^ZCF1KWd^pyJf`*Sh||WmZnB@8}>8BAVH6aEvKyaa8X$hrN3UDDXh3Y7pldUtbp_R zis1?h!g{rjnUK7!lpTYLzpPO)u(7)5NV*v`XC^=d5hPZMTK| zqP%qLD%>`=R+uL;Q#-09=gIH?RRWA6Xxb@k0nlwaXgX__Q#6xQ7M-Mn%c>QCQOGbg zVFjzj@d`mm6KRVc1;qqKyzIbGcq?s~y3eo{K3O;C3Ec+5Sl*(w4{x0fT1!6DiJnw&(L@a$KQVe~$yO-d=?+ zV9O#9WImU5!Sfyt%3!uvNo|D4m2F}gxz8{sUn#YcPcCql8Kp5t*DRwh#ee+=`yi3I z>cBk;bf0e{Xlv>8FPGtBvd=Ak3nXse`kNn$AvdChpuQ~rOiJ_SZwz-Wm@UGnQDp_6 z>xOISvm_@h>@#lDtcyKBg8tKqsRRWF_|+)9t(a{i72c}QG(axV%jIiXkrLLjF$mg4@&C@MnzunoWe_`~2W? zQVyrj`OMFQbOjvt22z$V8@Qdy>Z<4YXiwcUt$j;M*2g&OH1-*2K`m>69|Wk?`+2Ef zPBX|aX4(-!>sK@iWTwc)HZUY)Fmy+k;HZ=HcT8oVU&;Y#UE~>!F#L5_5Hbx5sC!ww2LB#eo>|Hi&6!c#p1{8-SS3K2AVH zi!+Qu@GZIG4&TT6YoT^)OT#@&CY7InhHs~Ue>NmAJp{j}z2zn97BwsbRyYTvnvQ#? z$)$h<+w~S#+df}X);*p=`ieQ2aP#MNS+%Lvq8yydN?{ga=+x+VKD)K&b^s%(Ez`%) zJUb7(P%nSgXA4u{{-!}9y3L`LlSRts&mYF&=Svu@DK59U&khkq+<*PY1n7Hi{c1C4 za0N~SAvsKigJ<3E=jON;iKr`V5xRTyp44w_l()^gjS!Bkd8RN$Am?uB`!~9M>;+L- z0t{VlObQI_6RcJ$4^ZT2J6SfC8k|MWl`aY~|8pZ`rrY_vG!d}k-&IpIWcb6H{F%U< z{(t_Dm*|-Pv^Ak`gasLfd;B9T+?*}fSjs-T1rV!$HUOPhE<)H89On1-ifGcw?~X2{ z7~9+*dZooJOhqfC2G1}AOu&qkt#n=$qHl);r(_5ii*TC2s{-P)ZZ3mV>MQtc)Oqy} zSkqO<&NDU7WnWFrXOtLC>k8%GMb1|OdI&jXF86x_S6vF=()oNzachZadu5I~zu@Xh zaPr)q^T%g1XVgfLo?kyh2vb|xUR1y5eM_cBqeAR?wSb-ul{`fXG0zp8NK189Dei?C2b z_H4nZtr&2xsC*x!_BVY%EL(e4Y5OneLROWq)#Nyp1&l{Y zrMWk3Twh4>7RT&cZ4(zLoF^*EdHi__B{B{$FB;i{r{%6J;??K zP423bv~B@v2cw^E{nq}tNA=)?^#*CDlD?=mXI`}ntZFYo%yS^|C6vgfs|1qgk@4V* z-3}tCeCooHEQSyeVDpVmcINPBIE>A^TME8P1EgiW;#h#gExlY@XpvUqv+=CBaX zPTp{5!H8*EKO_{#y^ZOKA!)L9F}UJ^aTBpRg~fD$U$Iu6&_9;x`RBlke6FO4;~`euaVz*uNe`g|xN(N7epWp&^K z?<%P&caa^n1pelU6HFKtjlAtmRHW#){{4P|nyYLH`#vGwa{X-s^<7AjC+Z-9QF(d} z9S^4T90d#~rGBweGR*Nbo-=x{D*Lbw-n%qJq&PpVImcTSVHYH_xKQOPDLdi= zGDsbWgw+@DrPz27_9Qrcx2TpkK6OK#NvNN`@<2b+K1zjh?mLfmzU=-1Nkcm_b*Ery zjo6lCeUTC>J547=i?#2$O@IrbGyAg8`WN807>VpTr0?fwy~ zId;y#bcyd^HTrhy@WAW@t#^Vo{!Bv+Qse_u*#)F7HN;H4H zP2(Fnd1PFp#CnFcro+|H-u-v9i?)&!hEZM4?9CNc;(lx3hk*7xF^y37*04y=N z_h}Bi!3qP!>pd9Ti>&2%8kik4zMi^A+}?AzFbJyxO2vb5PkylhQcHDQ+Eopxe3CYh zm#6iD_nrKOP27`i{RDU_g&~iO>MYv6sl{!kUEU#_tHvqg>Z_N`p4-2kahPF~1P%Ta z=p;qF8{6w$PAIO{)lB?VYWN2%>c$_JfLec+XY-GI=|I|B(0O-w;X8oFSX#4yBBGS& z@CCxXwsBWeSIQO1?Ab&MciPr$NX@N|!Y+wiT($W{Jyk5@U>B0W7q__Z*W7Vo|9gd0 zfAjmoB%Eh0rIxIJuie>O9bFQ4ANEI_D1YSziAp|U_Q5I4l=OMcmH!5%^7tUXE2p&c zBvRj6V-j?!$YDU1F0NjPkT(%eL6Jg*(TL(^w47DoM zPO|!%8~Dv%N-v;7kONjVz%LBvm*N1!Q32+0@N99D%x>~cAvpMUw^&=CUAx84?n#Vm zmkg$>58ekD%BEDOyVha0LWuQEtEb%+8nPJiRIkaeEdqEb=#w8xO z*+{(`I^37g3){^@0)FqXH9OoJ#@1(UxyqzAkB(_*L8Z8>$iwq%=`4~>0k zq>^1#W)T|%kJd20_a|10(E7)zX;8pCZj>utW&0|qV)|C8zzx2Nn~Z&*T?`mBtEnRb zj}A)Bh4V2)B>;9sh?65ZZSQ1vyaz^K1RKZGEc9Wc{MahN?Bet6&Hr^#_igGdnB-m0 zDltyx-?CuiiUya}@7-Ku(N{?}@q@R=nZQJoFjcd4%!wsTITL}bjwO!+1v_35X zO}>Uv9M`K*{)r30``%}{DioyN%|qhHt4NeZENXt>g?m$i@-s1R3Fv8doihs;T)zG5 zTE*wS#CCE8SJievlY8wp4Sy2Ky!Pw4K7_G*jdht$@Fkql5W$}F5sw25 zR~#LmH`>p(OcYwBDoIR&NGJ5hoLHGRw_J?f9IA*u1)Th*vIB2sot&9FKt{ywZ>3h) z_KIf<!%tT^WHX({uNEGhY_+HD4wSD{`8~Iw#nH?*w{IhN^O^tTS%%>rm)OigiWiH!~OZ=zTE<|MT?kIMZ~`vnUk5KN2`d1jYdaj1X5h zCGZeZ8pmh(7}N}QGlKKX;Pec3O#o@R7>JY@0O??X5g3Sz5Cj(qJTUs($9<)@yPmG9 z%(!K)UrNwgexu*7#4T4csOh>!HXUSmmis!}{FU=7knD5@ZwAdlh_`5RPV(4L)Znk? zgVEXK_6u|F>5cZAyFe6gHE3=@`!P)L%Ne{F3~zlL&^msp7Z?F&?xEnI9Dz{whS0p7 zE9%)xa~7QB@k#J^Tt7wf!vFerA>&Gqu;*54LxbA@v{3*i&#R7UGbDS;*@$!{XV>Kj zVgSH3C@(7rR58+{-DZ9!W^I4-xGw2u43q-`!Zzf9-^T*B=JNtXJG#Ys%YOE}nnIFF zU!~V5uBMmvM0m zE1RFU=_J6(+Yqp{Kw=3(b5=H3w^m%b_F2IxD`D{SlK`56IjoT3Dy{@t!OdL}qjFip z(imb;@AH!NKF&Xd+=%K`?Qb&^vns=FrxSN9Y~N?jZi>sk2VZF)yC#rA@819}l3!H< z;Vmsdr?}5gp+NvCqtd0NU^=cjM#|AX7Z6Oweb0M1!m^QYIzE3V(258W5_%7`|L_0e z`@#Enxr%;dSh)`jAg*QFbk(b6PuZdtXWhT_7{BCmgR};bYq_5cYtFZ2g8ofK1XJ7^ zvBteDWc#Ybmmn8(!3q)(#+eHE?URM8kRjt^YKqJI#ATmkdz){2QCiO5^nmwoHE{n> zNTOkg5Hv;H&s%h58J}S}jS*|MqF0ZRt4hbf^M}x?A;QWEG#PNuZP$A)NEhpEm${XF zzW#97YeHBtR(LzT0q4ZTzlAXmG|7sAzszsS7wzncN#tDMeXSBe_*2WpNjtl_Nl_(Z z5XZS)t5$xg$&ECDK7<15*Da1S1@Jd_L8xHvS-vInBF%63!#~EK?`@9&~*xEX=QC zcU-fm=e9ODQFHkuE4VW3SoBzLlOLI#V}vG299Eb^4-W_GEfV^MPPhmbI9rQ{rv zKr7{~f_^GUO(%`ts+tlr-5a@LP14Wm8TM;{efNyiNL)7`()`l-am!ho7E=NMh5Ng> zY(fR3EexS?Z}Q;Hl67!B)GdmoC6-K>8QB19YU(!eAdGhgbQj6wrqx%Os(XuQC!gud3G%D5_j@{$REc7Wh!q?v59ApVQ zImq7<&R`4N+Q#J`5^+m2D%hKX-Owes_9ia13Gve5B*s5UU-Y<$sqnN2`4Ry6lY0u~ z3?b}mtC4+E2**=G|HM*YsWLPfdfcZ3OosmMP|JwQTFU+cR+X%H^!58wcxPw#3eiFF z7xK={W`yf3*FRMY2fSHc^yi=B)o3IS=Sj#7&4)2Z20VxC@x87yus<_WWY^0u9^`Wk zh`%0<3O$L}`3-!PcPkEG(C9TT;(kDe*>d^*sgr2Of#bsA8&wAz?17P8-ZEJPqfB*r zPTC_sC}3QYC~$w<e306|!h%!t9 zPfgn-0FF?}=i5$7AA+Q04*F$mP2rHKN5jWO1~bYc-E8i6=o5i}3vMzCCJd5WL~-N7 zc0J7yJP4~O(`dN_AltWM{uFbRc~;_iK;g2YfxT@lzd4`MV_Cg>W_NR0-Ue>jtCpwwggIi59Ki9Ifyhs{zoh z0p96pH+4D&1{sU9C`xhDwZ&o_8*AkMr=_rurYWk+U*Kc#Mr_09nT7m#r|6A#U0h&B zuvFI9z`XZ?`bD%T8vu%c$*xl&UaGv2x&2*c7(cPKKix;O=;cDQzdd#)1_UVJDp5`N z;S^9|;t2?UagtaT^i!_sgyr4uE{sM5lzl^xoBSIoVK9yh%&4>Xk*9VCcLz}yb}8K$ zg031;nLu~Z?tHUfiZuz$_N-bzY(^KPCQwtwHFx!FO(o-GpDl267YxAtukG!Zsg7K7 zHjdFr_NyE+k6YT`y(90q#1Zx}JyU?n47059QV}JzfK8H*8=51qGGm4j=fr_r?7$XP znOI?0?**v{qZqywu@b0otYkl%d~bX$!YqQ19GvD+F(HA8FB9)Em43Z?SWNBR)9$c} z^e(;3&9;eY>C3H?Q(?9M6##zkYPER{Z^5#(kkWVRe8!7)yJ%(_)`_qO--0a!A#3G< zMOLb!vJ7(4V;g(gJwLeiQGd4vP?9MSMg>PK=ip6Vi z7>7n1^~t+N7s-adZMPH48?^Tf0}-^bx9DSq>v!p2aLyr^ zEi}jFMh5khx+?sC1iEQA?md|e#l1Pzm1C*zew6V$G z7zHfIiGNxkvH+}TPs;&*Emrn1HboBbaE?>atfJPACn{trDd%t7y*GDKVDcp~_?LPr4w&WLWgS+`&ChMvwkF<; z@E8@~$;-*QZl`{l_cA0lPd*$q&o#-VbF0#Te#!DD9htWQ^-pQnC& z7HEvK&U%UcM}h5Oip^~q1;F8uv)H*ns*`M~|G}ZQAeq zXT*RZA*=(_o8Wh4+AaSFA1bKEM2&ZwVi|mi9yaR-;fR;Qm`A%Ziolv|duG}PB8DI(>0+Bl3@@+uTN7n#~Zf)0jIumNo$DNO~H zOY8!l_B1=aE4+7dXJyKhA{IDsQ$iT~yt<|G-|UVl8HY}4pAt(sAQu&_6V;CocNYj& z8w9rI32q8kK&RHsVBTd?Vj0BenM{n(I{mtT~?r`lHP(TV}VAdxgev&;;KgA^rg%^7hE0(3@5GViq^V7q^_W( ze*6?#;YaY1py1_xrm}@Cy56caK9^XkKVsQrC}2@`nOuLoW(8#>)@_ov$REqz&>#Q8 zf^a!paURQ92^ttIT9@FS)P}TMwm5>7(CpD~v@UEziIsXwn+7-iOPK4Xw*Utvn@nvi zC3B+nvk^c}>E(0VAp$PJ^lVYzXU+;qD1AkWCY-(yLo3@glwB+LH|?tL@f=!Hn*Qg> zvix&oG-tMx-d{k#jjcf*Y?9(qoQkse9Zuhb6Oh}0x#79BVzw;z(8na_46{-ILavu( z>z`8q=?V=k5T5n(*IMB{mQmJa(c+gU*ka`97ErrKzN%~lfzxJ2XL;)%QL0d*Oz(3G zs6QTS_`EJEsh5D3HKq0PvCmrp7`YdyQLvwlJQOWWZ95x(6gEl#xCvzNc8Pkz;d^)|qlI5d+qS9nEeL+Nw4U2nw-!j#XN zT0(0PaQM86h0hB{-v_jhhyRc_0tTjus|Poq7a%R|873)WME7N3`)y4Rv}MvVqVny1 z222}!`1i5Azb*-*2B<|XvYm1*`NQ$6fI#?uzO@l+Q4A8r@S(4IDz3;TYU}M-(6)vn zZZXQ%Z7m~UPVRFHph7QQwCL?<%S?aU+!G`a_VB{TAXnf;FF%))npPs+^egH8FzXcG zgMi$kE`EQ8wkCh0-7@#@)&1xS+pf3uK9?XNx3gs0A+14eVG+U2e{9`FC9urO!k-K2 z?YgbYJ!{nzU27;6!+&oi>^5URQ%Vdx7sQl%1^Zk=!g7T(sX>2>ByYusi=V5s?YMvr zxWY8FNp02ecOC_I^^?h#(V*_z?tX|jFU7dQiwU6HgS2%4&@@rhYb+)!G|?C_K=?zTP^MaL)ck>O4 zsB`KfTy+*}>cw4)sNlVaaN%AA^#k3)<|(7?EK9)h3e2X0x|b0P!-_Ekee5mWbtG`! zTF6%qR*plIn5~5%uT%+z*S%Mou>Kazl(TPDW4VCebwVAw1m+3k!4TokH@zfN()2mo zVQS=4hhP4G;GBV_F`?`R%`v04(JfG$qDUjRU^JZuacgt6S~Hb|sVPd#b)SB#q2)fW zKi?TtH9Ii5IWSC!r_Sy#n>gIz_bO}O&-at3WHk95rtD%RIW_GSz!7^8g<%$fd@3Xe zVpygE`NKU{t@CEwsRam#h(c=G05@TjaAFD=M8R|~wTV7e2->3xn&4fdI9ugeS(+CN zB*@sp@;NS4%H&?TFRP5# z9Cd+Voepbr+1{Z1duXCr86USZO#i6VmRotiMW#U2+nZjKCJFtiPj~mvINJ(_dF*^c zREVUzH{Ax~)tR$;#k=^mnF(r8iWtr2`HNYsSky|a^}qg8-sW1gX}tn~`g;NaVWJ4+ z#)yc@t8+izUnZ$WCd0*}#Xa*|1mV8Lt);e?+f!Tn4Bd$p{*IoOg~@>ex^F(;b}!7S zeD=o-DRU!&!F?7fju+OVSZ()IFv~h>2mlZ^z+{j{OY!|O&_!vM;f!i4=-=oOh zhF~z$)c@?iNifLykZ5fHnZGZdLPEHb%GREtZX#jzJJg~f+83z_-9LpMQ$O+iR0rHN438YgkbW1hUgJ^ zWhg8s5dcImrxOai4H*EIUW$B@ zgfOaUB)ZDBR#C91kq`vGV?zUi@cHX4Sh$}JbH1#fqdlxB%%aQ{wGiJYG(&^SvtR^c zNlhL=+43UKpcggMX~Rrj4kWi6h6n`L~;1lT=$FTq>2nA|~-;{$L7#~_%$ zt+4L{MXq4!3?p*P6kJ+S_5oIQF~sOigq{8tp;1-!j#508!;m*?alFPGWZ9lo=uq%L8mx(Ddb00t11oy#ask0uY-`O0MvsRn^O5%R`0&# z1}Q_5f8$^I3d5FCsn-yNH>=c}HJPeYLJvihsniMacghj*99^o-ZdO(C``nFhh2*V8 zg64gc$cJomZxT#of?#Kl7pm=Jijlj1`o9Ekk*|1DloM<-RS`X<3bp7zXS3O@exgqB zs!m58l6kILzZF#e4npv?{zN}p*x@ReWbU#SuYVyZ{yF;TwyA1h&su#Cg0rRG2|W&w zt=DN-%z4Aw4}QZ$@21T;k(PS@t$z)*QsUGR|B|0P{p`b31MPu9M2{c)%zilYL~l|z zU88~x^rCN7Jc@#hCLY5N|e?1pnfgxDgYBPq(rm-%`j zjLc8)!aCqkp$LCN?lpSxo*15$O{%HLBkGjJ2r~x>hvTOBfE& z*7@5Ah!ZXD`>yg6?AxiM2?4x_PRy_vlgI)$Ao53_3XUvzwfHe=98ljojc2pJWXOgi zV@kw=1jo*eL00$Rf*%DmC4%<;A+{>lQ6~8!csC_&U69@s#)@ZCFCcaZ<*^#MrOFU* z4_`)m0-Pdl<%Am$4>0i7av=e=x_$vbkwyPeUIWx9>vvVXHn7jkkhn(Cv%rqf%S-yz zKIga7>gqdIck>7Es9pL%q>7qwsXOsPSSBJHB61iM?}d1(z7P46oby0Xp){;H)s{j* z!Ro5(51BG@0_cvM=U@K`k%Wa{q?BQWL)I-z!TR%){4h)!Qv|)e6@x%(wa+~^kXlHw zKOWKa**SE|dq#AmDH8_IKwATUHZs$jvWarGAlN+jndG^sC9`J>(Haz1u6hYUgL92E zqm>c#bpQ1d=x@(wO`_&Lj7cr2zlOFF!nz{dbS#0}MrhCq@?6j&B4rg}!7#ni@Vqs^ zW0L}2^%RG6)GsRp*Ql7Jk|`_O;0g+TE`Pgp9T737pdj_CO(Ldj_4o7ix&923gGBFH zuF61Cxn1*o%g?&ge6^Z^4hP zsow2d#>xAv93`-{g(39(oxjH>IX#~x;3&Ox(4aA&Z&5@1>Qm&E8(~TV0EGGbclOdK zJ8aO%zmYLoLo7fhg0!bQWhqvjwLRvow73L?)E7pRApsQ-papU(Z%G&zF37)x385wY{kCHA zKB5BxR|kOTD`6jGj@Ovv(`@VF_l{%sCMln>iuK8}a2Qwyep2HU=uV zf4K)=c1_4VnX;U|tAgo1><4C}5Fi}DxS^zHe zrtkUdKnZF(NPw`1h7bUIwk?kHp|`KnN$8}v{&`i6Fk7|roy_AJ>H7q!)%YN(Sku^E z*~e1?V;9%u(lX`bA916$OePi7R&Fsgu`ElstR1Og@3UZgKHE37wb*AFhy$LlDp`t) z;=q;j7FM+eQndi64P+Fo-+gHuhgc$I1y|bR3W8~1S>cKsX(7{%jcA5=KE3S0_7)TWTU|0oI}DMN zy5RJ^BRZ8IKtM#%rLi}eW|jVCA|zN!pizdbDaX5kC3c0-nd-k%XgU~@U?rJg+exl>T<{`)nfWVWlM!R<{y?SZLtXeDQ(Sl9Q|N^ zleba=LFVNGJHeYmfD8h3>b3o|vr%2MtMifmHYl;A{N0;%$e6mC*Xdi<+N?}PrzoTt z&*_ex?5IgVc+*M$a)eBMYPvwL3*ebrC&H+!3a`vBsg@a*A=E!i4Fmle!#DFpcBFX> zP^;2q*qq+2544`tQ@J}U!Q-rPNY$^Gl2!?ezg0w79>(&B`(RTG@=+R{IcwQ8Rdzi; z^uTP2D2zK` zUKZlBAbDlvFBimr>3`q3O^o!isR8b(1yW1?{x^o_@4UTg_kae2)I9s^^Y9*#-wGk0 zzcO-kd$HOFQ&W5L{>Aq@y}BKruX+=uc+nV{wFF}_THt)!AtPD0c{|DT5_J#5JvY5f z-Lqk8<=~7M4&;DotnkBbTC8tWf51`~UlYLVRML%bX4N@KR;~LRr?9>80TBdGz+4A!C{MW^aH6PE}R19d&hzs!NLhWAD3E?hOlEd`PDY^@bW3z`aL2WNg~kgR;{{oKdX*|VT&5l%ab;#yR47L!i0 z52I|2=B#R{=_J{+QiAmE2^T`dA!#?r0XUy;gCQ(Dm#=EgGHbNL_B~c;SXTYRYU(mn zYM^DUd|naRvt{&KCR|hVxn-1H;EFxg`5cdRXb%f|)6+rX~R&l7(V7$0Yv@lf7ENX@82Ff{nh#3{zq5% z`K;^WtvTP%t(;rQFQ^xW`!lWs(#0*?>Sc~|pMcBslz%yH23qzIv9ys(_YCwquqprz zmaj;JzYLgrYr=}bgX?2Jndb`elfVU^0pWeJWDV;L3HszelYucT#6$#ORNmIY1Rz^V zt#r9nF&zb2Atft%RgUHi3)uqeR#sfXRlG72F&gyulNJBr=bKfnw%|%F^wZ}ft_GjW zy0v)T@HicBT|A~iKmoDkErcc$a{WNYo?ELLutrzR=MSR3PwKK0)4ep5G_ol`^`dTt zKfP4JJ%4`&Rd2#`1yhl^fc&{K(mM~ZasXXGqQ6qC0f?UMMS5|+<04WcxLsb21oQo? zufP#pT8l2DxR}SFvVBq5-k@(SRO`=f2(UBI`JQ#2jeg=``;1X8*VLvW+|NstLl6l7 z$>yF*3d{bjorF9e zZojQzMI*vQ@$aF-)LUR`t-7`P`J@S^+6ExYhc^*=K(4HSaBUOwOT-&i-IDuHIU5Sl z^Jjd%3-!7;bX^t$^hkeChL!WzsQGQwNz&y208QZiz!4pP?%7}tqr|_gNR<6vE&jRb z1yd{ZI$Bv_dCnAu0`~Je`nPI`@E-32n_%4nr>qdXs!VeulCMA<`#x{+e6?cEq!zjj za}R^9z-5q#vOUk6^Vj;n{>NK3Br8)iVPXqf^c1wf7HSA)A=(V2fZJMPghVFMTU)cF ze%Zjs_H$XozU991xga>7D-fZPR2Z2)@FaXLclbw;%2BrXM3o5aYvR-s#2HQ3B&bcNdQ|4dke2-U_!dpi7nCM-cSL6?(W-66{-UFEl z_8^&3WF%0?5YTFsEpD6qzyFsFima+%?1J7>zgbu*lWp0SDxG?%WVhtI_=xK`=)k9+Av&^|D;~x0= z7N;29MpBE%f#skkb!O!9vz&HC_q=Fj`qjB+nS|bR@oZiB8KMSmATDDoTvCi)L0k73 z3bI^pI`*I6&qV#!3L)pYMe?RF#RxI2^oDU`EAzQ&&>Ej5Qv65as|3;39{owKNrXjky(;!6t2F(Q6tr^%HQur&DoH3E#ANF0l_81fc*zH{ZCz7 zPHh7KEG?2>&zrVSwgTFf`hNB-674SmR9~j-XMv!WWCGnf9;9w#&i-3g!v(@&fRzAP zD~IXMH1v?Qz*}=@_^*>v@2>gmx>#1>zd)uYt84oIIpu_=Rrvfzb0Fe3f*g9=yM3A) zre)z0i$RfInDnZHnf=|-<<%|F6n3W$^C%lLsN~yoSG1l3L;u~_8 zw6iD3Hq3BlGJjbfLOQPS?354y!soXZ3?Q;54?d4-Em+kE=K4zcGaUJIrGAB&A4`4` zqT|v~L}ND_SM51@Usb2)y$Ke#-hXcOAE!=*Q}}9G#^^Vt!Z?QUz&!umR+X}_vOcOS zBEbSJCeQjT8Y4kla;LKSbYYJG`X~RI2WkMQ&orD>`fhh?v2o4iqVVv5^Wr$SK2Hmq z?z_vEn0@?~ZV5$)6=H@KD(wKqt}_<2=UYjp1~|PkzfY?YmuNQFJk#& zZ+0&VCRr$f=-&d!$jIr+vspt~|3O-RrGhXleE`I!aF1rG#}r!nw{)6}xY=I2jHSK} zZ857RdGSrF>e8y64{po%vWSa1+K}F0d%GW%t`gSE0Ult-yUEVt9y2Ii)PIF{+ya0^ z0S`nIC^^xw^_)9 z7Jy?s)}_z3r`Akx)|GMAs{qxAmmsNY@-#WfoZP?aEomfE_Uvo2FgW0w$%x;nDg)v1 z0k$&z7YdUvd={WRlTdp|ZV%+pfP7yO(Xs;K&4AuC5fQsnwLqQ*&V1-f+VYF8GH&O| z%bP=q^jXRaEq)3L+&^B47Ub5ev9cfk2v2kew;Rl;W zQAvTLFsi}oq(andR3<07rN}d* z5P`;U8w)OJ*Ryo!LV25g`=%oyO7+^;7j}C=2MAY$nEdNMaqoGvA70Y6Me8SIn0v5r z>byOK{^<&wwnoefa@sUy;Y##9EBd*va6A8S@8bH&5TW+T_uN)SG7;{_>ry;_qb=9q zCsfnV1&Q{hz_5n?}I@82S9Ew=P-sv zceVp7X-~rygK&L6(*d{n?G$MHxfQMUnLspx6mGdVK8}i$e~5*w`>_rEmV}G({-d!X zQh$DWLA{Nh-pk>#;zs8(?p z;0z&-DQM4&=lfh7UO^)Z=Z)NO?qO(QOLoz5-{<@I`*q@z2u|r1A}kf8e2a9MjD5TW z)LBGkX_z1~qG&4pk_2d`&HH%T(nU|@w=JNjHvasc$dJU8vgIH-1`*V^p<97Vj;84P z@oj3|CSP`x5UX?Z&-)JcMC5yx+4|MtfBid7r%WR~WtVHKGUu;QMgS7qlk>LAZg3=C zDzMvBQoIKtUv0Y#B+YTmG*s@8ZMcGo2sQDquz>)%A%JYre11MTepA0VQ;+@dT1abz&V!+ z;peqV1NtnW+Xxfz_le||&$s`>SS1!NV5w0c5TtD)C}7U)Cy7{_SU$RPP>6pC34Gp4 zD7=-v#c3i=Tlm=-ipi{W1Drq{qKB>^iGVJEb?3>NDB7=IA0SDOgY5z z7*n_lzg9cnm)Nj-U@BiKsD^Y zW-R#gJ;@Zg6o;u``d|OaZ(+Pqg_!zwT(mn|Lm>x~LfJ7*lp*H?u{32%XugSDT$;rx z%S7JKw&609hPStw5w$)Tlq1?i`_}z~@1Mi1`@u$mAJ0ib$gnuvs{6rg$2dj+a6Yf% zL+n1o5aApy0CAW$e?nPAaLmu>9@vBeI0we|<+Co0wDRV>6pbeaR;XJD(x3tJd`b?* z8U}O(hw&Nq6!W%zr3$c7^I7Yya;<=zENah(4Q#J)Be7n7-jv~Ip}lRKfDy9F=osJ@ zhwZIw|2~k2xcIUDpU^V^`)!7E^mg`G-o`vb7>v=*dGV~0pkEwb(QO#0gw&b_xuR={ zvTK?lC%vC^bV@t9vS_Mxq(5yvbdU(mlUz*QYNmR1#cB@p4%HS{gy^pq!XG&VA9*%p9sG^ zHd3BH-dH3?p+a{qQQ=!>!7U)%=h8yI`*2)Ze-fHo&@26V{Cj?K{i1h1c|Uuf%@h34 zV(3&x3hP^2+hG3n@8Q75E(FRC&zJiuy62XI&uB=qOd*56S3gfviO8%~DXNl_ zlOxs$dxis`_Y9_!3eJMT`&rq=q!#thqAMlOmw?_wBR}9#(Le5_41w^kn-JSvr(veR z?=#w#v1RZcS^sR#F4vw*#bS^-Oy30hy^sp&C|sPl|DLTi*!Gi6Eh>f(CQK2Ipb3A` zu5u8ISovhNm7mX%4s^*H#TC>An`3Ivm9Qzxnj+zPfzDF3Ih&vPQ+PkgDmZ`YQGMGr zPqwxeMaK68=PFADU@&ZO=r%xcjm}Tgj1SGwV8m6@yi@rBL%N6kR($dc_XkV&57=zs z{2d4@aI_4Bq%1_SreuU$d-k{REI|bvw7CkaUs@|&X^ANmA`{z&5p4-HkX3Kp03rf} zQBb^R2&eILFsjDA`iY0jTJ_TulJK9=P8Mz|NxPS!rP-5w#^e$VO>WgVn&b0oo7@7N z!fHwAEzxrRd9U>wiskQMxf(_Q@lz$!6Di&{YnTYs?PNtIx3wLFoOb^`Ofdh3rO=kJ zu+8Wo&S*5X0Z*+WxoVPk7y4$xFmz()q8OR|xVnKxZu!MBFEMDmRnJfK(E>v(={{(( zQ;QhOV2F{uX-0vIsn!xlQsg0jPtkl4x^6__Yksp^P72znFuV5nHTR@Y48KdMMd$0; z?1#BoX?4L1Ikou)7okw=E|ss);{nBRSw-Z##quQ3ol|X!^xnrHvUkn^)!*e1*VnbY z8t@G`d#hl=DisWJgCNxk^&OjBHjf6}LALr%>q6@SG;3lvWj|TmKPQlWt1{AC<4Oe6 z@TK>ZclHzaKTNDU7o}$3M9ATiq8DcMvo(4etEiF`@dpru+Ux7u1(qcrO!0zUxi# z1|bXKEtLfc0Lu&P-pX?$?1C$&ENStep)Dm16SYCIA=~S!qQ>?)h5G=r-lWqEC33Wx zv7%i3LN%{SRN7MH^fN-;@cYezrG;<7CF0E*V3;9L-fR>*H*PAeC zuQ&=z>p-J*mG>_QTP;TNNEVsL8XI`KZGOe1V|v*p-*k9f-=n4ty>fJ)6eT~{EF(76 zrj>Z({;?5)yUqrh>nxfZ95N0;!)Xd1D&Ui7yjp9#$4SC}+k}QUMvjZ&Yg+Ls5`8M6 zk)LhlwhoGA2^yL~PFPH32gqwCh3`=$)I=6fW|&QEEgs92yzgC@;mJKu6Nt>Mle9s< z9?IZhzVQ?TqCV)F(_JuGCe1pbO-_P{ke(RTtkY2Mc8p~b>4rv+E1CZTrZ4>9uwePTWMBVw2sFP(}y<1kP z`@@BTfH?H2OacF4$1p$Mgw95I%OX%|Wa?pQYRuME9-LJ@-z6)!6*u_6PP!3I5D1}N zCRnYSgM;;vbf%Ixf1B;yrAPHzUbf0W(MCa#Q51f>^Zrvg{bL`yb#!#-*f>N{sWVCV z_Xp4m;L7mHTO{#)J*tzY>F?2YmWew6)xgfw5_}6&Vu|MoK+hm`g^Ono5{2n5XpN8+ z>SCJ@j(!nm);ORL^3TH}(IbdGTW~8n`4YF0j%v<8gUHXfU`>b{QpA~{LV$HLG!4Y3 ze(C^OZ2Z=<5knQGD(A=I?%jBCxx#8etq>L|g{Xi1yQHQ~-KxaT_bh4(?&b6q_yd+M z!}XpO>bj`6N3F5MEEm0Jm!ipk&|oj!w0_xihtMc2XGH)pS%cF3`3QQ|P9Xoc{&&2k zA2V6~^U$`*w}Bt%+2kPfHt&!8(~1(8A+AsC!hA3L#;H7UpwI}#AzX}ZO~J}}RkzZc zh9~WS(CQbUbC}1|V9YR@nu_XejY8&^!+ib<*uT-=cc*Y$uIz8K|3FsKH?7S1LCt|~ z2U;tr`moe;M6-qI7;ZwKz;;vFQ-7u&qIH-68Foj^SrrgZ6fc8)`x-?$`(JeAvBQ z$gb3u1sDMRY*2$*G**T}TZWYb$IuyY8sSR1jLZJ~HOeU8A7M1h>yvU=3l0=NyQ=p| zTs@ekQ<%ebz9Y$PxK{LZr5mGPkmX;ylB|#M$#L{e8wyEE;>Jzg-3xBrG$S!)0NADtmx-!tE`i z%p81KoU^I#pA)*TT$aHaNbhm}2{hK-69^9LXFkjRzE*@K$@hfeHg4VDf~#86pp}1o z9RAp$ra8~JO?sXbE}3Vxmt?rQKO)(G4g8d1u&3wPt~vL(Uf1*Xpn?!@6$yiMY82qd z^>K0qC8eRYy)Z>wF?cpU8HUcmfFx)#|L88ETz+WI*mg{0`jZSsnsusM6bqXpATPW?btZI5V(o5!(qDNXip)UcvKI*y_>ats%d zh}ix@&@Vn8Bmx)g7X|!+n70w0SwYE}==Y}+(Qjg^;(i6Bjhey~969P+NZEOOYdX)j zlcW&6Eqcxn!1GexzvWxP^rq(7)cgEBf+H)gEz>>y{mkz}se~E^1$}D{En^QHE3!gi zYYTR9mAd7$>n)mFsaR34jRcYV?YPz@82KOkZ=m6`;eMWX$U$mzFod@Ix>nKcp?q#Z zBfes<%>K4A6tlI;KFCJ3j9hb^^yM}{4GYe+CZrPKme!FC8>m@+=!_Np{QH2L7|Xk+%K^lL{9ZaHh_tbi^NRzd<_wkFE9_H?`Qd&kAYh-X%o9+7 zHVA1URF>3~!2kmB*X2r*gHa6`)YhhB4NO*Gq&N^zV-aFtuDD^%v%MT4C@V-`jqua0 zK}74bIk|lJBler3i<#a+^6S$CXxkj{yxsZnZbcZ)R{%$v(!^K1q2 z5|4P1-|~f!T=KEdY8Q>FZ!5&Q7Ul`PKmC|2o%C@jIo5KX1QW5KU8)!>;n2aRC6?#v zpA6E?N5E>3Pw_1x@fe+`c`iXyIty33SNF67@^>0boZUjt_%4H?87h z5hUf=F4Q1_r~EApyc}@HZayl&Y-aE$!)gv@eBq9Y!U48c;jdF6Q(>_f4>Ay4`3N9A zYeADkXZI4=ECLJDCCN78KnJyVan-% zKPZr9pK1)zKl~5@fo@QxCxG#vv!r~o0dR53ISMW=dvjsY0n+sc;CkLfK>@0(&K!62dJm&{V}R_O%tq%#s9a{$O=c$W;n8BC`TZ223-;e+S8iB-7~mn26wAXIlX3ZYXYKE zAo=NwhOpZT5J(MjWbE&n2hPiR_u928GoFa7dh{g!z zka`oHmSI%a&wA-T3XD$1v#f?fF>VzB5BGY|_`Ri#$-w=#(bgssaSM;2B`~M>J0{@^ z&jo-F`J#Xl?!KO$fe(0)ojVx{z=&o-stYWu{3=QY!6S}d(M3GgO>P!zx&8(A1ezA zRUrfzobs2(oazp-W*A39pF8*?XK<(Iy0X{I+c)G9@hwI8nC3XS%_(kt-3}VZvBRb- zc0l6*nfB-NVY0NaHL7)>GOglyL^;L(0}MpBs*=dHSR4>M+(er^w7AQ1;1$RpnN26Z zPT>v?Hn3}UBfj`3i3cy!`Wq|`H>u>o{@W)e8(FRj;P2!0`%a12!T+P{oKAdlGmetQ zl-AvLZ4|O=RwM#3DPSY=B{dgz5JR1ulg)vSM=_>4C=|Jpq$Ck0Rc#`$my5es=Steq z_K!w0VTQRB2rsTu*OtC_eZlTs8qKZp$)*OVMYfy*P3$)2V$7BE>lp8SsevtWF|IIq z?Wn`qvA)NuZA_v!GWd)4Iq@?(_@zz{)|6P-)RF{jq(BBQ)?Yb9rL^#VE;6XHw79`;iR19b%-lD7JgIg;%UYnJuzLQU z-N2c{PW=iGd5f+Cjl=T{C1-y7qkos#!Ua|YJlpmN0kwC@J}kyrZFk_bJAPp8S#kJ5 zSba)CFAlrJL|v_ku(^@nxCl1QS5o2QLOpR1M4Ak|Lm*go)y}X3AVnF>tt(~EML7b~ zySYN3eW-XbW%oa^%FY(#-R7!u;_RyU4<1$ z1(|SBwWSn-ybrZbf<<1|zy3pKg%H!14HKBhIDb?Ru~MQ%1m?#!mM1^-uK75@M(+AM zYZk&qMV`l6xf8@`*i~C^uf@d!ent1zp6nJCT}5SNAG_us#>gnwS)@} zsaFb&Bd=CNF|PA>{5}Zs`iyizf!Qpb`tu@>Qz`Kn4kB|DKo%!d>`~xg;zgDOGLB=u3l}= z3M+O(nAi?k-;RaUAcbgpQ2%_KKpwWCaL5{wv}-N>&Mk8PAPmHpB3w)V7ctErVH~A1 z+a#9U9~*&t`+kPpUycrC4izZoPfF*RXhZbM5OIZ?9+}kpNwSSC@}it}^kZD<_=hN6 zxKw%|a&k3&QuCyERy``mrjDu>v{ z{`<^uazg?Y)PlXlQdux6xQuWZLU1iRC>IZmk2jeL-&;c{Hd2+Oj4>iB)!1}lb`*4# zcz*}-r_*;>0_HNUHKBQHdiAixiqR_K1iZ|_0(Nk5(buVh8Eh*-_AETykiEOT{^`uq zc<={*&Ta-sQ=Hqu)}8c0G{iM9=w9RSCt41X5EBN&A&RI1xxg}{vWOH6TJ)br3yNC% z8F&n{*?hmoH+z*xnesU~mWdKR7(+fLs+_2_??R0Rpf|m6(6SqW`q_7B1R{9wYEm|e z+u+0mR7)Ki0-!ak(xo{DQoGH2zKYY7y}=-t*xlHwjY^P=G zS~Xq{j>B01wu~O8s;WZc#Pc(&nc~^cj)32U=!Z^%*ui@st2Q*Mb+H=We2ShpX+q^f z#P7<0WgX`WMk%RyCza;LH<&S)AW2de(_^n}4SJ;RZ1w;3eB9!#1Iq9I>%TpN-k_8z z*QS91qyTWgy`PJ^N!QjZqGH7i{kf@;^CRs26`>%IQoH$FLvnee6z)TMNQKr0?fMx& zT3Pd8rNJs`Ioh;??G=O2)Nh;S76vSgm>Q(DNNr&e$b0TH0_bqRYxLBRs~||jX`AJs zt#1jil5_wQ!klkWLqSWcE`VORT(*?0KpXLPt+xqsLEQ$lh_VHcx&0euB7E!)mV`n6 zW5xXV?>s}R7OCkuBmBA)B_mndJqER=YL2h=EGs6LWJ0*O^*e|esM>u0u6liNDQ`0_ zaLFyKiGp#yRNAJZTa_VPJ>sN-GBCpZgj?HJl_)Sd>>>IsA^bI0$o+mr*%eKX0s~dd z)PR*=ajMY$JxjuVhKRO@OCBSA@2M?}YPY|CqbV@YU-wPDsu=pmdU+|`!@#v%l!ojD zwqOleyoFzJ%t^oc*|%HZ{IUe1fk_!mzYl#?mTTH6GkNezq2`IVdigzJ z*|kaeWC*J6p6yL4gq2<*``if7p3x}GiVR#b0cRfs2?c$hW%}|ym1uxfj)Ccp5jnJc zZu9TNwa+rC&@_}(w&7?>nPVD2!VoC)pZLXA$_~#=6P$lDg#<{b)n$8A`uTQKZ%3D9 zoyPobbihK;HaPSKEDKxQf}X(53|{c8A32=Zr;T#HD8|X}wa47`CaVQFlV61L=_pt@ zC@CruRoz5Hj)_DIbWmH+LiWNCx=;tACQ$mcT+^7~Ad5;2S4OQZUe!Z;!=gx^VHkG@xm){xbwm_EnSLtk_22m1|I8Y zuJnon%9|m|;2NKD*QOJ2Nf%B5QH~61ZzJW}Sgrr#e&2cCr`Ix?(GxB=;jcsN^ZaN% z7Mla!t-WOZqDNcL0LC3Z_4;RQm*ye?2`OyxVE>qe+(gfdt?jV35T+^Y{VuFF0n{ZI zn=2pv*80%P+c>B^MfjA-h_G$qlF#e(>2S*ML$5y@Q&_7ye@PU4 z(~iLgeYqGI#H8R_zC|iiJRQw(IJoG?0T{^a0eRt%=L?8>&@rvt&2L(h)OxprdMC%E z5ttx+y&ECL%VUBpLC+s_a=dglA*f1wRKxiB*MCcyWxSuMImz=CH;sCg)IVNpU!=^; zse!Bv!7|7#k4hH}yYfpTMP}+sy-{M-rp(mJ_@@+yc2w`*4nE#35Yzx9XwbYT7P-7f zMj}YBYBVQWg#7|M7v)+e9V2A`J^p+Ov?A9kK;MYdfMcN(EfgQeU9VZ2XTm~%C7aNo zO&5^bFhNhk=k~@dKtqH-asWj8ILk9N$NhIS+MZ{vd${-gc2Y5d&V>N~LSSrNY=X~Vnm zS-QZ$ppOHDsqYgk`*wn};g)^!lWMvRTFj^t16mo7GC%!DhFdc0!el3}hCLxU9Zp}SXrxB$45KOS8GV(a1}1eb*-c!KqjeZIjUuE-#J@hS)Q8T zI5?5{Z1}JL{0e8XI1u)-L)Wk@n%)L#z=&AUu(U~$m@?z80Zb-1R|JCnr3xtB6Kha$ z=@K#&jgBoB^0rNcF33Qh7ZA#;dVBX-)WU^O^P{1fW13s8tPgK#h1BG%d)e1g7~_1a zHZ{L5^3mVlPK7Sq0umkzZvjzF%G}Qv{gd9j4Q3upg;vf`qb<-}JYR;-4VAjeeAglJ z`N|~f?df%cAzfENzzP_Xb8OKy&uF2Di-0DzXpGOA!VgbR>r&Kmsy*8<+c~&3_46E9 zK`dYjJm1J^WaYD#*#Y*aE(SD(Xxu@1`W`NpWC~YcDqL~onloV1`Vr-5*qM`@YYou| zgDn5Pl7H%(+gv5YZ!;w6%G4CJaVx>r2bJ`jpxXeD8j8d7qIfHK4{E?sDcHWG5+bAl5GKee#b;K8rotu2Sg71{OD00p zg@f!W%hNtyk{Cta#qAb zKv?UP?(ww#ju2pJ^!v94a1v9OwXk|vh(E8aOgkbHFm!XbxrVq(b5l!BTbSm*{!>nD z^Sr&-#<|*5D*lxoBpqSe_wzjn?Q$En;qPHB3No~x)q5_Kp5DTsCQxe6?aW!~``L4g ztEqq^T9rM2LSsAAT!8koj(}Rw{=gwH(h5i~bpf@b_aD8m&qAR&&-+`_TXQZ{lM}r> z-;>mk{=BIuNq*S4gO$9WC4)`-dBmP{fPZR7XzF+I>FO_6rb4@7pqc4}wn&%zeRe1y zimj?|K?&ugm;X3-OMo$fO7M#gR_got`)tE5iwRt|DCf7l^G_05^W5#iq%-8N<X4`Rt2DXLOJ)4gBDrKZhi@;fjGWc&b=8`X5h#I(jL=tU#1Q0)x3-bU zWuIJGyhRXZAeeAt0yWo0({!L=HJZ5RlHo3FmIViZZ^F+t(F(V z&BI6Fuzv*&MaCXiiL=7<;!j&ZUoze;pDu_TSQx^e=+!J3 ze*aV!1mh^hAGYGqCjKf>1>>y168#I){Wq>+W3n|=MTAxQV9O3O32VYvJ`xFHDO-CK zeHC_aAAc5eA=7a{ud+mCt##>6WKRzc_mAbJS$qIPQ?NQHHm78As(V|&APMPNKlPRY zQsI1o8xw^uiXygZY)w?58w{4JR2fGX^eHmNcZ;M^Tpo7E;m+h*q{EG+#k0$?T%r*4 z3BsaID8w6W^`z3J{5;j+#(OJ-$}F+F`vmJbmLNivfJGvMr?q9+YwtK?YuknXHVV51IGL??`}glHh@ zVJgRB-2l_;O7`blkePE5jr6#~P7f7EWa^lP7ihD6w|p3YgHwf|UCPRVSJ|tmE`~zY zcnKQ+>s*8NxZ#Fi>^c_)Z=&Tz^$B4##F}M@VY=BB;zxBZM0gidv-DO8|Ces7mQM^0 z_q~5u_dZSn^hFl?zYv?R`mAaXU;(@Q+~uciRZ3*f(EuDXuBdwT@!6#?%3I*iFC$+`HC5C^ywjN0@HcMoWO@%-%oQ;hd2Sx+!WCn7u(!u7kbF&QM=F0YMe2rUC&tc`JXAJM`EqaHG%@4r>WMf8bQh z**1mZ6~{XHs(|EkxqVW@_o@IfA4z{4>o7mS$Pi(D+cuSoDU|^e7Q6vy(r_115-c{k{Ouy_6s{|W3!Pd#w;ZMnvjf^py+=)U zP-c=$qNd;3n+9k4Itnt-bXyOP=qjcMCSSzD53!oF;8DD`_?i8B#rS+O3sRlR~s(2hB0FnGBWGfDJBH;ZqT)(kK~!+ zS%lkn-zy4RG`O#>f?fzkpi(o)V;&bZn2`kQ7Oxdc>>{QrW+ zP0a?txVJ=pAg1mC!p8haUge4)oZ9dzMJMXuM`9OiY7QUIhl$uC;^DC%SQOz)N8V zC3W~Xr6uc_jO#bJ63?ee#;a!zC}rg$uFe7)UoaBr4FY+og>j-LEnv8}L7f#ALh3<^ zbKQ|@FPU{ggE>~UTvVf)7g;m9XrXo?dNQ?2HS~Kx!7Q*3UV+YVvCELlm`1ZCxf-ib zMrS5Gzte~mWyk#hXV(M_b}XUTH3Om?3YlF^DqM7h)~-i2CKmqPE-{UXc4v31#kFYL zvtMw5PqlGiSbbS!65TDv$Mrx}#|zB!g~vc(L~d2Ed8~oNvgOa1xodzc@9!O^CyQS< znkqeJls(KwK%@?psn~42Oq-61$UH7P^5f#f#XUjwouze9u-w4I!pBsYSu5ML^JW$uxep+P05>aJ)jU{8L#=`@^`Odp#FLJ(I< zMM3@aVw}?L)mD`LJT&1HcMokg7HM7~weoFQl~-rsoGf+OyN~~W|EIeRZW!e}{QSDm zgK&O68L*AQVAbAA&F2CLK;*uE|0Fk-$!GPGt$-mKiczWe@rq11y{z?NV*U^dx1!w! z6J8x_^wXaJe-aqw9wlv^6XTo+g7wp+JWXL|M=qcyDeWG z3-1${Sda?R>u67u_j$YTkGr_6EwD294HEy(7T}<^M{dSi>Fw$Kmc+qTYI1B)aDQ7{ zVSr8QOJ8yUTPUNLIRE)Y(nK3%wkBigdG@J@9#@`eCYz6yMx1O^8zWM#ZZ6X#RVsEHmat% z#gKz}y#5aBUR4ss3u12pi$slKeHWEXOF`j-u*3h&9~s?3Y)?Ij3OpfTq*IKW`A*X1 zpG3I<8!?v8oGCWj1o($2WCcjU8;Ee02{VdPf!xH*C3SKdrq*2n*n8mGVhD;|SlC;F zfUI^I6V`Un_8TC(&Z)9(t&-vTyLY+m7>!Q?O5Eu^DG|@1#PW7uYK>iNPWDd8Vcsl) z|DLupq!Jq!qxZ;I^f#_mdmX=GcE2fe5l1sDwrJL7%JAdm&Zrp#0` z`LMSz;i95=MWqP^I`3i$We=ucS3|Qu+$S;&{Ca;sK+eJ?5VjA!Z4f37$zCtOMb&0< zrB)k>Ak;oqF9!~Gb)HJ8J;Gy_5yu0(^3vX%i4hDoC&fA2dXj;=(gp6zJxCZ&DMax^Zl^$S{mn1)&8fj< z0fguphZ9r*-Gx03hEB`QNW{(r)Dh$aRdnlDx>8c53@-oS*$c|!``beZ_3OrG0`MpT z4pi5LL$O0@Nn<2mM!v)2&Q5jo? zS0yIgjQE;Tqn55T=m2*3K0xenMFxNW`i}&MidDe$qV1?KGU_dEL@nVb?%7H?u4qYX zsIpTb5TkE#QQL6nc8Js>oot0kj zGv_^?2P-rG$gm3N^HFPlUAv1G30_7)Veg}h6xuKsM2{FpXN>x`Yb6Bx#3_eM2`28x zy23rb%;4F&IX{ORaPf+`wUNMVdyky=km07H{DM27d+AE8oHIFD1q64iJ(T;F3$EXj zS%Q^02BwC+vN&B=#vA8=^QJ?N;IMBYCppD|iX zki6%%D-1NmJOTM-Z!Zv6?eVa$fVx44Ttw8j`3rpaV$l{7%gnK%$Z*+hS#X5=S)ZY7 zovT}dg1FWS8_AC<;SBI^25$8Q;LET5joTYsa_dGs%igZtY|h#+Q8A_r$y9M|QdJFE zD0szPz-2Q>$T3A0Yb)Ha1J89TyV&By(<{j=qo~cS=ucO+x2v-Mn%0hJB0rTl%NGY=M6C*%TGt{W7r-$>f#eJk`{XJTC}hze*g2HUr{y_G*^x#)47 zC?w8`4M)6xKL<}Z|Me{PKG1A=6{oicQGa>2qLVRSh^k+w5Ns)ffO1AihaWT)UuW?{ zbjZ8}mMcX+C~)B56L0qEA{>R1RACCm+3?D+P`II<^N#wLa8R_yPfwY%3Vg*@+NCI4 z)9k7Erda7JG%DQnYL9l>S556}%DC}^wN|Q=R+s>xj|GOC)19}1?jL%=_yzm5t~ARx z_KL4cK{KA-HNn9FyIXMVoF`%l6)zs=4rjDAWo)kPn8QMYr>Qs=3-_(tD7<5hg7RQe zPkKt^r>q;vh2}_1;g<$V1Fbg&Fb zaPZ82eM4H|<}kbs;5LBJLQTD&uPVjSQp@@t_egMMS78b;RJ)&(cczer~G&B)M)y^IUC|9=iK#inYI`wLP7>!XCg3unmnu_kjM_|Jk2~ zHmcP~u>#C7w=LZ&2TWXh-ss%omSviAI7&JoTx)56y|)(9Efg{kFrOq21t28sDkmx7 z;xl_}O|P^p4A5SZ#&4;ShK$<$==rPCJ-E*hTH$Rd(MCRrhBV!KzVsnRIkuL(R>1Fx?2_|CcQ*dv`Ap8Y(&3;j zBYnjw5Zihc`6s0b*Kd%c7oz2wpYtJ2di(ZB=h7)#3bq|sb zv1JNF=}I0wS%lymL{sBFFX5ug0q40AGgXt$a)r7zf>QVKJ$j`dS3@F^Y zRt(a9fA6+j^e{LP@zpG_xxOjX-E0K_YCx60KM&?%F%jY*a{8AZVhJ)&4w2<4lTs0t z_EY~jri;jX?N&OdouEJ30~YFN$;$r3;K^C%sd%K4k=l0cVyY*im53sDy##^s4ijQ= zSS#2-xD2=6!pR`OY#(Tbe-)(qO)wU*$_Nj^u0*6*_1XoOsM2DOIDOHxR&}2JZPm)R z1G10{3W5=&6JT~RNz`CQ%GpC6+`F&=+!Hx_N_nQZtC&mY8-%3x2D)KkFA>IZvcU8R zagG7K+Yxbi;Lg>-W}jaPCX-JbOl<^X6{F@iWD8}Df}V%u5snAJNdD*a;6%U|x>FwN z%*KSbFMJz#bfZ$3@n-d=^A~1OK&o0Rsu8*`p4Vk8kn0q-W`W{RawQwkHIkz={?iahuaza?geUZ!@$UX$s&q zpKP11=&h99uxt^;RD81J;=uwD?{VtqAZN~)wNc1<_Pn3TSvu!3o3z7j&`nFySA zP7}qF_h{!VeqHy}K3P)R7j5njWxs;B`0NVv zWZ28V0(sE@Ez1ZUQ3wtF=8SJ!WVW_USa6#^+z?=Uo&oh%CEm|f1zTJW+wd7^S;+aa zmWI!;ZR$rnx9~OtRJ|l(MF2qTP$xUjO294bgCR5wSeCvO4Ouj^e{aYs2)t6xejm(} zDZ`Ix%-M>?_mEe5e=V$yqRf5Xq<|3pcQ7S0WZrJm+`{_f0ANCDf2H;331|oaP-wWX ze7tQYY2f_HOo^HO*MEwJW#~Qr+2vYAh7u)m#i?lmfd;j;FzY^??G*2s+UU3N&)=EC zqT&2{F(~XEiSuO3(LUZ<0&)_N{XRyXbyAL|_4#vod0vK0TNkK3fpgyT`Q(55|4d4| zRZFX4xZl!li3^#5G{s>6p%!qU+N88O`4$ru5R2w5^!M+c?|EAbq(Th{a}TyX?H)2T zA{S~PE*RFLvNa3@OBg06TUH|Y2TU+hgxf15^4TpgR+yfs1}UE_KyO8HWHwgOx`GO< zDGPhBDQ;sm@!$OkW%6x~v~s_P#7;8_skkrB^=HIsqwkp>&{{blA`!f`AHM}%s<2Fb zpWBAJ%%!Fs#D$fjg>~LadiQL>fg;O`B7ZhfsObIiTO@=K>}TS$&&wVJH3WXrb}4w# zF*(N|xW!d)#@vF8;EenPG$^B%X<|q+;bCjP4cFu|~1YsM7)Bc5&E%1J>pCNiu7@weS>HC?VqZyySqPTu+3k2_pw)~!b z48nevpj~v%zy2fk*(Ll;>+kR-`hFHawR7HfP;)j$zXbIoTOmy*Os#wPxpc6feKmR4|Y0A8CaBnI~sEp@X7R#@`nk z)(tBGC9e;+a8-s3E3UfKJe#O)EnN$U!hrI_m)Rch+;-t@cl7p(EBB?QBYmIAK4F+; zxu>TVp{{TS`lU@qfsFpPg0!zlTe7%mjLiN`y$HxOxAyrq0K`;M-)E1t0ik7NE%3WR zLtELCu--Oy&pL3#mLuvX_%u1&e~N&zgwP`C<_Gw%_cgZQ6|rMVr-7WQl}Vaf|tR$Lel zRxr-YiCL~pe132vxh*+>R(1i1MjG~7B&N2L-`1yBk}ZV!9R|kqreeW@&B_m6rai6M z3Q65)S^T}tKWka%bC7VJ#1b-%>5%;8$6YIvrN6TR&V~8gGq&cq`tul5ft(yHD+BE_ zXaDt|oDR19qtLj;NK2F8)-1bL81&XXTMod~J)yUwQ7uRgT`pUkUO6~{83qEEDO{5K zjO6ew$=m|)HepHe@H0rzQ$`AdlOSA(TTEaozduT>+}@VsHbSm`sFx<<;5TJHXQjS` zex2u#WT1=Qbhy302F{~)ZCJ^kTSiZMBe{ZGsgahUdu!$M*OB>S+k?!BX3L?h-e4p^e(N|@+Q6VQag&?WOHPAduzJH_8!=5vB zpWB?CN@cVXJ@~s;eGkP;;+F1bhaZePThkiAMXY{putt4}xZN*>)KW3Pm1qcqbA{)E z-|x$)-&zoeu&rCS=-zYXehZa^aBZ!71tR@D;BG-qoRjpNWx)7+)o+n-WkknH;!rLm&6Gp_YGlHZ3t`A z>q=p_`+^A#f{NlYk)EVN$Xe+1Goj86e+aNkbu6VAW>vu5jk zP~T^WQ2MG?tDbrxSe4`~!r$GiRBWr4=ANxvoq@#QM-DQ6zMrM`j9jam6B$HJ3i0m< ztYxBQa{9vD6DYTp2zXC~Id4S+A(?k%B1EW6yLTEmFP+a_p0S=QW(&%FytKj|Wzj!; zwPk3b5g-_P8})r>e-9Vg=I_9~Y;!cd+^$WHlER7zv4U&Q2*9}nlFIfP6ePcLJ15TD;KC=u?#P?TL-a8SZ6U>l~a{w(IX}w~?^&^Ik4L7woSGEYYF`^7)H+rEu^32wHn^dHKjZX7j-5&TNk!en#R*{ zfn?np1Jb^6Dn_w1qMc}pA6NiOk1l_Rxl9%B4y*=xPVp_aIo4T%d zrD9$|)f&baj4xTNpXEKcJ8~(^0)^|>7knqw)^EdD|4b^r$hVi&Ae}JqxiLNSsZc*V zRsF%AkiYFQYxt&jTXlL`4+eu`d2RsE?}Cm|XIxWpX_4n2s(4n+ry)Cm7QI~<8rhm~ zKjlzQ7Q4HZcG~@PVm*GWPUrP6!W6@&G%auui6IyiD!B#U1D_MTL4~yo04#*j!U6XF zq!|#XQE%88;$Sw_xDqGYSI$of0VW&K&w*XW@UX~w&O!bi(*bxd1AJJuT>1!}9q7-M zsd}GC&+xO~OrXt7kTy=~B;K@r^W@eMWcEFH-_U~Sbqs!o1!b^?u;E38Kv2&ba=>wLD}h25ieAAlvQ z=f^g7uFDSrWK$8HD)5&RY>O+w1X4-i!RHtP#iCRuy!LwXf_753 zmk){*POEP`$#{nsOy!#tF@hMiG?q4DK3rTqPs~8pkKvsVfVDh!*}g&BX&Tq6e6~s( zFzlVeViejeRsL?OJ$YPsadao<7uDm%2X)DWPQnQz1CE30@mcsR#DyZ z>+jT9wQ^I|qNlQo$_)VDBVvKfwv;F|3#@>3=j-ra{|N!W_xa)JxUJ*2mgP={;!2}w z+|MXXOmPWpmBTp!fE3mT4uLS|Eg3(=1M6Tcq_Ij+mnWk{TA&uBgA^QvK7Xag5OGZt zX6`56(z+_L3GC;BTOmNR>Q7m)n4kuQzA&|%lf-NyAyA`XU-8zLk=#+1kadgiLx<38qz&eh&#iol@oFk@wrrl0oRmJo*s(9Pn-tOlBVYD)=C->pmtdSsNWz8 z;I`#+dq1PRhvC0%D#+ss^i)ph_oXA$*Hc^zAw(QVXNNe+1G14; z9+;A1P@6(5?0(x}EkZNId#&l3S7tzB2s3+)ymnj&H$qUo_1WBDLm=5oo-n1(<6AE% zSeZ_r!}e%rI(dr8M%x|zEvr)O0*_dU!XU~==stVQUEG^h3UftfSW9n2J#Zl1mvtLe z=Yq**O06=u&cs)qO;+b3(AF&eKUOU?q;vP}lw5v(qO9J$-1ILbhX?7|nR+!o#5T5l z_KgOTy8I-*UqU9FlmJH6u8}^h$yN{|l|X!4z~3viBZ9NtLq8q@m+5nW@xl8kwhC*x zCxqx5;K}{t{LUj;Y^ftaIenNaG5;KmcK5z=P`%KG#bX~1FpYB#mkX8<9`7%Dg|5M3 z+T2&?T%mZrXO1Z(PyA)F6046g>zQ?2JfB|7Vu~nq(;pZX1PyVn2~{pn0$x|Si=Jy} zU{$fGa8jSrQ6S{;3=vLBs~7R{h5ikB!B#3@*vanOpnR6Y z0?w5EX7eGlm*S~Y4@X%}KY2l>f*5sKTOKmLGqdcO`Xu~h!9g1&ci&z#9ZfwnP{lv5 z=ckxaqCNZxtO8m1Uv&e#1{|JL;eUQMU0bOF`;6|!SfPb7)Bf#7K{?r1JQUkhRSQ2X zXUEVIj%$rU8A6R%AUaCg-g_+uI@R2ewSvX~JLi=e@UUgmOtrv5OHLmP08(&AqpRN|krtoJN;1%!a&!tLL~zwxV*)eoQAeUaA5-l53m}0@leu)s{tc?8}(T~ zGU)n$Z+((7`Z0&j_PknTv6mFPm$`-FkQp-)a`_haanPI0x9jrP;2D*}&O*W|A;9|a zlpXA+!nZD%yQODe*J2D4%c|fq7WT>z1mw*#uB}oWn(e*hAxvdJM*utoLDf~x*j*}y z@Uts;kieCsw_tGf>)N&YHPsfWubjsK_grXI z6Z*MPo-auNVbwR)FSRL!FoS7EA?&{L=PG?wmlk?+515cat>~ecS_JZW85Hmye!u+o z4~8mP0CHi^D_W63qHc}@NH_H3AB6ts3yEvRXbq9V{W*|rPEh1LcbnVU&#Nsn;)QBu zk&gHtO=%MREU*_U3o*IR@4OFi!1LmtfQwu(F@N``w5h-}wmx$!0vLm4Z`nDv1Y23H zD$H-93`W~z4A!#VLQXaynC-WpS1XOkY%y~%41IzyVFXMi{Hd$Rh%`biEo(^Ow#68~ zFLBSDs}Lc#-$#G>I0BaL5K~M@ZEary+yA)X-p4g%>axbwu^SwQfD2(5G6_kC)8+Ie z&koDQ{i)dc>FItI;uoFJI{GIODF@|q_wy1O^-T2~bMRx>FFwqoQg zHV^ck@re;C|N0+dlXMzCO9V8|PWh~oGpvod)kAQ#&r2w%-Hx;oa=vE})dHwNnZ*7= zzGf=zg+V1fve38QViZziLCcu+bC|?|^mgF$7U?|ue%`*%5WlYsw3cj#+}>z`^ByBE zfE+w~4>Knk)@phh-;>YXx;%??^i})pZ^i3Xy$8=^(W@<^ZN1O-hA7nSP0g=8CA1MO zHx>%G3et1l=a!4c)&eXT7A*}hC#2<|hT-R1ec=Z$$yIG+Evb#_uM~jXV@{cY`Fu(J z<_muBIT*KrUMG)TA(B+;rf#1XWdfrV>U-=byHs7(J;8py2lZCCtPOmI2;973ab<}0 zck(;yUMA!kG|Vm28im-ZtK%4D$Z+HLwxxg!Sliz}UN>B(7VfP**9HHC>gQHM%SwK> zXo!l7SE;F*&12wwQs0szCVhof8QD%KKrfZ8UeUySJiSPJSj%3>A;WNX1f zwMuchA0we_ME+HUsnuH{8Cn2Mt;?Nog<(fq4i_JXSaEC4M%iM`LjHcKtHM8zEUEk= z7u>QN_gI(%!)eyr!-W(esUHj3uBp5So)Z+zq~E?ISxaIGt&b)(pi96g0m?m0im}y; zy0xI3{Ve^j|KV1>9ixJ)<8m|9uh;2kxBFmaWeplF#@fnhaPcRL}+exINo2M2EG85TvsNMDjN#7Glqc zDNjW(2(1wLIGN9_RiH%ogg@^N|D@7nrwo&6+pUcpRzhYJa-QX!VHz-S_M+e-=UKb4 z9k`5&_pFvRRarU5Mosw31RA|~^%%99N+SI8s+&V1KgK?#%>l@$bi-UtyNogt(Y5ngmQ5f+#Nps?UqM!Fe?Cb; zx3enKE9d;YExWhS*EOBD0Y#f4Rw&r@K5?&6rtI(-i3w`j2&ip@j-MC&i~;}qJ)F*f zS`}pgBmRbZ2LHFAW(uGxwm6@!mYbmf@(Hj&xE#_R$y@+W_C>!B+_3oscfe;Kk2C|b z$U_7PnlvLMBXrfc6#!tDjT7(uj%5*RVtt+cBYw~p&h0Fxs-96Eb{}_*5fIl`vtWKd zPHqcRj_JiW-VGqA*mP9~lwO0@iBTD;fIzWe#Q83)82*um}s0*PRaAz)~&!Tb4LJAm-Li`uw!Lh{XU8vwi~B$TT9 zTY|wF*Or!J4xFl57-ukuNVVmG@?9fZe@)528kYHS!J)kbqR5}!OZEQ$Y5HSa>AEIZ z6N>+rz@!5g1BEbgNDB;{v=9#J#Q;$68BjCmGlN?$7JCMFVc?<^CJt$023KkU22w~^ zje#)gy90RZJ*UpzD>I&m%(a-Z2n}G$;1aG{h}dPaaPrY%#mZK5C&CA%dy7h;U`5$J zw;`bYq*i?1q#i=gPYUAq+Z0-}w7Ro&J?X+P#tPP9cmg(w6MiO) z0kUS16%VZ{LUGL_2V;yY_N3K+*n|iR2`mKk$2*V0OWV5Jp%~3y<&bS5xPZk2n5*YTYh#69oO_)`BhaC2{paNAgGrm#AP4=+!O%vf=ESETZ`If^fJx-Q%92%GaS*BhJBwMi3`bl29Y2G`#ZzM z%O8w9xF<_s(14Q03(Cm31sDZm*x?p9&nxt;Y~MpCK35v0!_+zogazq-51_Y)=-ad~ zl%ew%?gK-vQ8`)y09LGF;7SijApZ5Q{gq@Ia+_G!NUnq$9{|jTV9j%ZNXx)dQizcQ zUzN2okREZ-2pQ@s0cg=Zgt(@*I6t{a-eN7y{Y+%g+zQJeqa>WBQCKV+U!ky7Yw4c* zEKJrb5&&T%XVgL*da3t- zp3=Wj^-~X-TD;ibF?I`E2Opnjh-!>plCgnmZHl{Q&Q}M}Y1L9(@s^)k2|89=pjjwU zPP-2zloBh3p!QRKSC&fZ`{;xJ6|z~p9yZ$$s;*1)OLGy*OK`JSEb(v+rjKtRVO6yN z02P+M7cg28g6W!5n-opW?#IEp-k}B;r{q@Y!tdq zfZa2p6rr{eWA7k($}1AIY%5pkxyvR?rEn1cv8qcDD>$hcr&g2+dzN&`mrs4-Vf;1gru|H5Rb9CwtOX3IT=5&iPcG z52;{bzb$<7%a#O4)}Z=HnCqB`p8=;brQbC%Mc=!q?9S$GTAI_vq#;tb z(nL|TvJf9*1s4EEp#h&!SO93VhINaM0xJ=JxOH9S%AS#S8ufV#k!OwHJeFfI2%t4MB09>Gw`c_#cmI#U_O#XB~Q{K9if|PCr;5{!p#VnJ6 zXQ$t;bsIpfu>4VAI_;pe0Mg$|C+p^dG|c0=6l%yhx4i4lVT!l7ZHo&Q7w#3emVwBv&7RaEB+5}%RN(5XA{~dTiyeC zdzsqClwKt?R8#w#O|In`%yY@DKw_ z`<_fE2HMIMv=u;D$$_|#Nw@-HYgt=wg{i#iFk)~k_q(GUUPuuJKS!Vj1_jW%Xa1pr zmUjMjTJg`w*|RAkidQ{k(21%2m88QdTM1uhK7d#}k`qq8C1Ub)$w#r%kfHZ!_ zug&Ag3xH(?TS;SBe@DPRw&5lO?4JD*i0yxmxycH(DyObhq--(?^h3@H35awR+ftVNAE0Zq4G9=5mi6E##>%tLKmnH1)&d1hI)b5duW}$ON^q z>I`8KESB4Q#Eu+6pC*hYF<`B;ziDa{7KRh>TbGK1{}YFin0{*a}3VIAg}DM*#MA)MvNq2sfuCngPIk8dKVW?)?X^ljB`5PgPaK3rAf zHzeU*Ak_cOSt?!#RcO7~=qV}vBNjvQHP%yj$hSCPR=VM(oM9p0#Ru|bcv|9@yK-%Z zeTJZm^_R-bew&E%AY!UZLkuH8Q=CBLbA;ex^h&Uv)^r?LrgNi>HcgK%}xP zZo-W_9Is!oH9db+@SRUBqWd>1KcAOCBPs`jfWIaQ{M5rWmbGkkwxEh?iJA#&xcsx4 z6c&4ghN-V3q0O-L^)s;hrxe8S>~0D;UQzKE?$<=YxLn{53m0X9)e7x||IJJsbr-l{ z)T%*gI@!T&u|7z!>kdMzQ2zHxii~UOM)rp#VT0fKE|G9bj^v~kf>4OyY=ai?z3&+4 zTNl|%$s6M)KepZhO~u4Z6)TzA!H0{@1(1dTgV?@Mp?dA^pV3L2nH>05X=YF;2&lrio%Zh`x(k}caH@q8?*1n}Qgi(emCbLn8GzhMX}W^wW#g8G=lYH4 z&8IO=pyMyCUGkNcZUh_A;Th_rAyB;_ZWsY;2T7u>Fgg!V*@sM0UY3NoT``rbMVJ`! z{#nAJiyO-b7Kj&LV8mj9C39z#SCu@RC_x1@`dd$MH7p|d<5u0>ta=aQv{6xpDQyU% zyu>aEbGN0!bX7%}g1GDMpP7@bV<%I!h(8Vef+7kYe*{(`-tK@7&c>(1`HtL^L-CQcml#`ZS(v%)@m_42g2*>Cgdk13_t zpy8X-0S|M?pDOhE$t~A(-4#X;ze$KPNI;*Qa76)?kd@)qk}+=qV*N7&OWeA(gS3nc zuiu}8E+>FEpsG_yW!~W)2$)N_^b+GxjI%3x%TjVE7JFhrp0tPH(Jr`z^4U%4cxR3Z zLEL5dunf`IOMbz6RK75H?7(Bz{+(ne)Kpx6k_~~omx8>K^Nakj76DM1A1qO=&1J>d zx{zHN2+ArEUdqj(miY?EASdy#$a5qJG+Hz>sFKb^tT{2^xS z-y*wYZldtbdP$(w3^t5{Eg)TDy()nh6bWmq5&92BQ)ImUj(Kkd?NTjEEmq{l|0GkL z-6)FPl;5hCtZUIP%W|2p+RXO}iqO_`*c()*x0?HO#H=^1HP%cjbfIqT)q#bSAC%fU z7Sbl$P`?fO6^UlS2Cl7Icnf>*-&swqzVkbQX#jF=QRb%>AI7yqT&gZ#UV+Tr&&SxIy8-{hXT6VYrsnCxTeTfN|&_1wk62%KmB`g_u>FT zxy~+RVE*+VzkF~%Z+e2`<#W-oi<8GRe_<2ev*N3^36#%Zt9KD;QR%h24lEoW#>b0X_IzDToJWI*)${$IBem97-fxzGF-GT(;M3%3`~?WOUndlA2?6uvwGV+eqM zc?MYT^NN5RDhR>tmFxW(5&-MoE&zHvNyWWF+0HTQmDNreZ}slMEfO>^jr(UzC#fkS zNprTeZO9=QQ&@}mHOy8Rn9kW}v-C-jg4U|Mpk(0+IUAgpdG_M(rW_PPm^+MGtu^j7U|>Z0CV8|IPO;vCR7c3$oaffw3^cP=<4Bxd!X%vfyKqMN{m$P zxhcH-=ZRWIV7gvfsatxLPq5IU2A1^x4c=!zw-(x7^q-1xE&7`#e%gGLGH>-5Jr(Y= z=KO7}4J)j~>TeaUp8@>a2PH@AC%~NTTQGmN{ZDK>gAl1n8!(<5xL8}4?C0UiHJ#A? zBag8t2Ov4~fH%&)-e5<|Z;&TyV3y|xz3NCB~0+4&QWEV}lG(h|@F|v!(*bd=s z)T^R@OM(n&x%_+!Io|>puD0N{O3sBD)9OSGG%B;`t%X z)%vS#l+J5#Hv1u>g3&N>yAZC2{Sm=;oatH#;?tG-p^-|(#Q%ZHt5FrvzF4}J_ac0` z_h|AS^5#au51!YcLxS~%#~#@}b}Ed4;zI*?1bsQ^M49cad3bNv3D2HochCfT%>P19 zxWNm1TK51ls=KJ-Xh~XN>15*{_pI*t1J;NA7tu@6{wUOi zASfzG#TwSqqKS1E9N4HDc*OUguUmo#tzvVt#NNTrC?YEvDz6dLCDY_b=&&wi%KVcu zdj$&#QbFC^~BD*yW5W0a^~ zKJ)tpKZks);YH_t3?Z%%#;*`B6@Wpg0iWxhGAi(%J%h^qo3gAuFG-EvkuDh*?}N9E zUbM5{S;%tGqcY->7d>SfuF|V|pBOZO^PYt=ooLRwrdJ@MjSn`Bzt&>pffHEgygb zaU(6FkX(>$*9OtB99hwxbs@Nard#8M$Ti zdEZXw{&~(-EezCJfSYwI9H^!;C#&sgyUG@)7v#v4{@(rluTZ93vFMx&G`$qIt_nb) z6-=Nm>wTt7z~#_V^qxdq7J`4$ z`*P7C7uxDeQ;BvDc*O`Zr>1_E2*K?U;+i!4jN8SVAGNwqw!mA!R+(9yzcj++H=HuX z6<`6b5KOqNEqL2GJN;bRs9Rc`%ky^vdLHtE{Vf7Wn_I!AQ`S(o7A*Hda7b$_B9iSG zAdr}4{aR4|k^!6po6uI4pV`W9=hs_Ya@xl}R~+Rm_Po#gHxByk;HOqoi;f|e-Ck6F zhh^%Owj%rS@?7`Ajf_zEBDHMoD`33{IV?iYj-USFcO$gyi5cJ#+H|2*B0 zf|k03yzrnT^Vf5w)dUltc>y9*-QJu*{@X&!ivz40`%l{eL&T<0NUTvs|6o2+iz{@f zK9Kw@dhm-n%1b`^y`c~9Af^H*f#dZrCS+(j9ullx7#+01r;B}`QhxYqORj3S9it(w5a6l<8DUTCDS$<43aea5seVB2Yi4w(gSz}5wrGqcf{dHCkq z+jKt(d8V%X=W~O&ki@9f&wDt5E^4(~3pK&EGyKz%hu?U>_c^i9#04`?rHkYt%ehQ} zBz6Lm$z)IWB=Jokj=dL2!Ss76-Hnh9Qc3nTmvg3dazi8KI*b$8hqYW3 z4?1brge4B2Leor6>UHln?`0n9>*Lhq?_d9w8$VSQoU4n*HViDzZ|bQM!`haG`m3a4 z0*P`mRHBQvt{6PNrOgLJ_mX&fdNog$3g%fYk^*X;n|=jVewzcVWq*0hET~l0XYv_s zC+_<(FTrR9AYnO3?W2p;?|M05T-4860M0@;($9>TO8-MBH4?EKw1XbV@JS$LQ{2L$ zuJrRTlL_ctP8)Y6a}xMzBSU&{vR`hh@~vg8=eVk9y<&Xpl%2KZERWDv(f-0S8y$nj*~Ju$B7$eZDDF5z|r&3xzFlW zej-!4m4pBUU{gr_gv_(NJ;y=aXY2D_YNG%!wKacVwsBRdp|XvQA^@8FY#Jt#m!#7} z`y`#)dj@_so$Y&IU7lr)ZfVo2gy+`CDKpeY2#JvE6XT39cm`zRnsa-V!uI;Sbc?$v zA|!`&uW8hlIQTV;03AeJqQKiu8vmx?;tUO6-R656ztk;~p>UH#WSsSVtk!~Noo$4> zqMJ83UFgMWNT2U|!`yw`6Q5-O2oBUGj|!vCBbsUaWno)vLN5-*w)wcvf<2hCGA(rOnl5DuL5^o&Lg%RJ^+hMB>tLleXURP`eI@P^D?0l1% z!M=v{-rXXrodDu?v1;^}feBS<3FrU^0WpfZg+)q|OIqaD3XE zl~;yw_p;bnUo-)D5V2TI&*Hfrh36mkUg#en&gxDHRG2@b@Bj}((9yG?JHhsu5WQ30 zvZLH1Rd$1lR|gB6@PoIr|6eAck#rt3qXITHDGc+>CU>2s*%&u@=6qS(^hQtm_IJbT z7lLu^O@>=5h_7q#A|7BZkFXa^&AQEv9JuNlA_v zY*Jw;#KwEmszvCtcsnsDBG}e&>)ZZ)y9SXxb|Zx8IRF^)1-y*l;2u@oV({o$*E7o9 z-JaRi+2u`9TNRSro6%TaHGku=svTIc$S+q_hIf{=e5oTDToTnsoQU?qcY)=Y)TzdT zac~#GuM$5*k=bKDm0LeOk;Vbl|vb8)WIBB^wo(N%*P zQ&I{lU`^>$(jG(*Gl-oTeyjFy4tGSCcf#en{EJb(^mSMi;Jlv&mqK-~2x-yQN3&*O zzW^%mmnDB{kpw8BUlH3xom^sB=&3^=5jt$+An?GA)3%b%Vw~BCJd@3+S|>&)4PRUgf35u%ITl?h+=QE-w+FMXi3m~6_KC_y9!nGiGk;qaVPv>nQAs5v0+q_$np*$ zR0$|Sx)_8g?W>5@XE$eRPoj!ihN`kV*_hAn?`IClqR#)hLRw9}P;l8NYCvwxn2vr` z30V4H|E1`D|JF1}OBYq^RAiuy`z&;;xqKfkUAa#fWwbyC3bq&ewdM;`sr(?)f_vzd zI4Gw_ptBD+vOr8k)M`=q{Vh@$F1tN;9d~@Lc}6PgFr3yJ+a991PM?>-UfG@NhRb@82o~~0JF^Juek~r zifU*&5@JF3d7Iv%tNzs2N^RF>3Iqn9?=eHJ=7h%2)3B{FuxAM~Bs3M$zp4OmReBX} zB$G~YMGSg%59VVtnl@L3Au3aQ00T%3xo2w#ZU6nvKgS%fXuW6{A!OA~&56tB0u>{Q zRoWmts_8?gK2XZTH2qunKQM% zA9+79;wC&IUGuqn7OaalV5EBh$Y5dt2>Si|Y7R{M6c&LGFbR7JS3+i0FWMDr5AQKn z!vmA9`N87ep98{JAOI+c7K&@h0DcT?1V0WPE-QU|loTDvdDi)G zN|IXI)y4?tWzmuU5u>eW$Y>76&sM;N0^%~L04Q*y{C>*paYFqQh24PO93D8 znJMiqA7-pVsvSo3n|!jQyr26b-jY~})?uoMy~@OFa&2Z~6ltUAMN>6gd+Sdywg+E! zpU6xl0n@S#T00Yx#;gdGSpy@Fr+w>oKlSYXKZx2k0eiiQhHa7$6*Z){dkaIv_;W39 z_(+15XU&$pNdRDPCkqdeL1j~z9^_P`e^u)!jZ!<;h_~7wyjr~8%2(a6B1;~BchTZU@9u6NjeN; zuHtA&F zqU0a;H7}ziS~|haWhP8Fx#a7sL_x9}-Bq6h?EY3R?Y{*VAuwl*`)8oi9(B&fKf57? zwD3k`Qu{fBE$6V^!?=vSORSSR?*v7kRS882zr4^kzz@@T$HPXUYO&fB zC&`obayyYK0ooYo7hCaP(W0)WVCVVV5vrikL5uDHpKjU@$+S#vfU3W(emh4`K31@$ zD4Ag?)7QFc7L==3L=RaTMCnN&%0K?xPM8=IDPRM&k zP95LaQIcDVQt86}Lfx(oFVYq7Ou?WtG=$6Bio#BQy%fri`2IP|_JKgsK1E%(um(&sOsk+z1Bi%V`k z7t<^L(9JWhz(U!PwlDwy_OCWgZ6{@DB3FJmIox(0-W4Mm+Z%@BLQAkN@f+8+qWXg8 z768Yc3KMbAu3JVqu=*7jD2}ktMXfZwWN5ulO>4&!s5%k$S^mJI~`tx&sQrNULgzFpJ5jA0x5x4X|< zF8*c*BKHJ63^7%;fg@m9(6BZ=0^oB?U;RivQQ8%BRYMnQYijB?mxb-TjgT-k1e2WG zQEGY_m`P!gMq~x9D{D(!leM)pqaO=E2+N;0k^y-|jpX0c{_g>%%%Hw2q;taeEEKUR-F;c zkY!;?JmCZ(3>?CBbf~^$l)~JH&(B`23s6%-B|jI(yHQB-a)L2%F@VV5yru?AC=|xa z-7-vK3+8|>{7Kb=XV?A(13>__y(@8v0|cB`>>Tgrvjs~w5|%maq#UOtza_6@tNh5J znZ4j}q5a`tEO(0}BviV$F}ZRPUE#i*g;6eU9l+9eI8qE>f}6ZFiXc9KVd=X; zD#M|a3U-)*uGo7TjlT-cx#f;K08rOt<^bz_o|C$GvF|3QaDoqJc45BDC#~iN=e+?L zRK6XE)$jw2bvUw`s@#pRWh0cq}7)|3$>u$QJDDHU;|s)PYuN|d%J=iP~uA`ll8 zXLniJWVBw$tZ`BbtxlnP|kWQK80=46+=AU}MdZ;Q6GKai~(i*Itsz`#s9VU zIgV;8<=H=0cfTaYER6Oj>HM3e$ICYI8Lg6HvBsz4nrv`-tooD$qo)NWwUq7!?XSb` z2Py>SS}%P49*>$dU|<#*z(wa=A!?^6AY~0i_maiMf(1_Yle*B=yf^b(p-{ zCfDScmw)^=o6#t-KrSKfwU=COS7M?}oUr&>RQ$%G_irzJM(NcLM<04=D2D7C>-Z;NiKV@Go484q9w@?Gx zM{gYpque6<;YL-?pa!WeKOGj}neQkMZO@$$QGUoPWo5xD{jaqHi#nMp zDzSIlNq?{@V^&W7^&eN$;iAkVlyGxbOQb$92&#U73ct z(x|qUOE3kCw|v&{iBt2J`?+|NpB@*IBiKI(<#Jf z5!S+eOTyOF%JQ^+T&PC1KDM7c%= z@_kn3d)Su&?{iz1t4^=RO7WiJo8O=*wic4$DH-{V9n^pzM>S`t5nPBcsfA?L>@f)k z)UAsC^U2)usre^p-B*-*$mPEeIOQmQ1@1Y6+C# zP9j@{C1A+Y1hJGs*RUcpAwbRf0xb80E&#vnu7bnb%2iST&II+IAe~`t&+|5JFykL}OxTg0FllU6jn?(AO{?!*K*s`vS6FU!x` zX-?JF-_aaQ81jeYR|~n2sb{d1{al%08$i7kI1MwzFCGt48&a^m9b|h!_F4g$tzK{? zV1P2t+C^OynM`kCPNJ7u;PWh3{d``?u%$^Z9| zN2O;SxbR8|0i6_SW&%jdmHmJ%Rb}#B8UeJWpBn8V(IoxrcJ3~s%0R1Oh%bq-3Z>>Y zmMNXAZ0uF_VOe2svK}oWhy@H)U8UiQRc(gvC0&^9^^-l5;J!-i1VNGTB^Ee;Bn3Mc zY*~4W^W94uRnMI913eKyP+3So$Xj|>%XqEuaamx2@I@j%E;r`=(Hr1}>hRY_gH0`UjX$uoviSwT4G<47)c=&EcWWsuUoO4G zgj+4LYPFTDD-uI&`h!Wj;t9Yh6>Ept-VDGTa)L5o>-)+z04tgjOO-4V+J=UOXymgb1wK7Vevw2nWmM z%h{vfM6Uddb|c%`La^HT3m9j@Cu~;Uk6cBrr2G%$E7f36l`FOCnUct@9xSIJTda3S ztWdA;JM_7?=gi&Eey#x-0K?yUmFTTpqM0UFe(O=ULu18XqMoVu|MtS;uJ)}jaz z0F6Q4%VnDQHF3{!dF%R7>S1`vKW5EyZ_EprqGO>bUqQ3L&*1fa#B6=Fz)wgzWqj<% zc@eyu4n)W6ybC+a;F8YlI=^V7uU5<>*XLjV0nutvOK||!tFsnqgwSPKQ#5-O2DXw) zj_I`Rdb=D(t^v;_H?1ogIfML+p3;Z|)he7H)n77O_Y*VEU@P{s$19)J_;4T#Gz}mS zK!aflSuIifLsPY6XwC^7l-)|u-ZGI?R)+K&>&Ca)cD-^XN+qE7K3ZYfZ*R~;^Sod$ zK!oHilGCwn(U=e4d`1KIEoCKR6*1ATjp|F67TDs}`@X55*EyFVA>KTeVT)R2`d6d2 zrr^qa#xC)9*acO85G;cT6`vn`43y%ll;g7n+&1ZQc^j#Lgmoh*BjADwRDaFTO4;az z74UfbeI`Y_O$|vJQ^-kf$HlFwJ%fO5)L$dy=1gztA{D(&CMaAeu+A%hsSE)$upw#x z5I}B&LvdAE5<&fA(#8s&#M*!g7~>53$2(<c5dBzhS;p^!7M!y=T!0X+hy{9e3q7eRpSON_nJfT6 zJsgzQ2-|fPtqSI{7V)#>06wmP-0wkPh)9yAu^ozER{x=rbG7H5VI>s|rMwgi_IACb zxOyMGHSIDPP9==Cxb<(?mW3AzY3?=w&E5icqcZS>&eX=UOEhNlj-7XM+&ih!mD!vy z(lx`)CxdFF^_Z0*k#^-xrHJd;W4USiB zau`&(rGWMEM)7qL-#}eb+0#V+O4v-8k4wxt!Hd?5adI<1&-po%&X5Nde|pn;*Zh_4 zxpg+i(KgnA$x0y-nMfQY*k76Fe*e-NTsIPS0p9AH*Zs?f-v3Lh_#3zE7u>7cEMx+c zHt*ZD(nw7PS1+j|gyQ>FRrL9=o%|scP6}jTFtemxv@sZN0F0mHQ8^nFOCi)|CYH18 z1=sN!d4hUxy2u|ff)Sn{IVT!m?JjYmhLhwXKR%MS*y%vu6&4NdK9TkXNwK9jd;AuUBVyr( zPmzT{|E5}-=yQlop&)LytC#4Ly6k(A{QlrYLYI2DFmp~>=^{AKuGg+`t8?^ya`Bhq zW9Zw+hDBX`bRlj5e4GGn0rlc35m?#bRT0b0khM}&`6`=AOF}2ilE;NO0Xxw)ef>1T zgIV3vh3Y528`M+j-K_(|E;FOR-O!p~l&+3GVkz=&DkbWw%3CbktS&Q?mKp;|-pZ;D zP0t~H<%^#wE72;Em(2MM?fTnO;u>rWf7>U4Vf`d#_!kBO}x2E8gqMqPUP zgd1`d-~%#Xeoo0yt?7~}y1-dom$P3zwivD?$~9L05qd_n^z?D!HFAM%W8fa{E!{ z7Su9aQR6xmc30mv!`|s=g00BA95V2uqrF*!k~>5@wYLS<0YNi77O(+(Myn9`*MC_4 zTqSDG+r$#((BhUIAl9^u%o%0F^K!Do11%$MmP~LVy)4W@$1oaClpKn`R+uIAl zC%3@Z`UM2jLuCjB6)5{$2yfMN<}7RSET$ZR0l9c!cn`SgsV!Z)Kyx2Ydi}ZS6(YFj zyeWi9Iy4xU1+7^?XaeESGz(E#C~mx^JzM^@PRKDwE0vj^kZ*@!Pu$iDfqB+=-ZSEg zzmAryo3nhIKJnRmHefsd#eYBK+j=YJvsm4OZHu@S#Ra40BvF7QooMpdl0gTqXZXM^`ah$SN=1YH&q`A2-v$utte6^0-$DCsLIvfkd> zHVCBrE9akETa#zYCepWJo(*Ecz3pf0@=vQ@K}On03P#GD$B7swNGGrW-CvVpg1mog zoLtpg+e^wm1BG)iigmDlK0;1>F6!2tdG7IYc`{Y60OS|1n8mYFI3Tx|O6r!Sg)77_ zBKV%<;DUR?pT$I^VjO4KrdCTc)I_SArqQ>NdshEYPT@X?p@C}ut!e^c13eW_%6`4! z;1L~4#VmD%v!UF+B1#ZseX}THbB|Ae=o|&1R|O3Z7nkI-wGRpkUr{N~;TmQR_czO( zSu!#WUc*4PGMFvL7tj8(Yt^(FWCpWOY&@g8c=NL(!y(ElG`kxg=vaT0Q7o0$`58$2 ztBV+j6vb!y4>VTO}riDqnkmtAnnIuAHq=E&<^paPgz=R!xW z4JKtl{BHdV`_Ij!cx2)DyEv9BLhV(cT290p3=2ufy0Rnea;znwQInh65NT*nQ!; zr%i7dOre2ofO^pA(v%fexc&hL}4}*l)$lOG0lWNQNe9N%dX9obaBN z0py06xb3a6Q*{!TiYovVCjZBKh6}!l6CcNYwofhp^hHP#)FVx$mc8!9Z<=wTl`MV_Tyv&lrpKSRf-qYGg1}O;&nUE3L?C9B9RKDy zU0xPP5o%@1e_t*`nKqtV+aoX4N&iVd&{%)X|JVQQvfh&df{5Qwe{-&20JU@WtQHN6 z8Kv%_(-*a6liKpHNdYj0ttpfl#p060vhBbf8i8_bz~`@f@?5te#pbp~H0qT!J&&zn zibl#n2(F*^Klhxb&lqev2_#W+6z3-u*0u_oUvouo^+;&w1)(n_K*SuFIXF_OY?g+h*GLn@M5O1`^%#d(al_ zVW>G1D2#~9u$Nrb1%Y-5`1$5>eCA*O6(c2PXLH79^!xGzxL=@Z-4lT@LRWrWWmcin ziL}6ZbCx+?_#{kzg&6g12-A$QswtX)WP#P^BMjm&3Ku1y4_ zU0!kA>E)06_OO%+&qSx?mdn>vn6E+7v8ddRVJ<(LThY zznz?6ZGDB*GKw;v3EP8)wQ?f>$7MwtQcDg_7BUDQESM-1mfG|bqPKO)dE)TM0~62M zsEU=~-plRaqNlu6;r_1XOfTv5@6ekQ-QVhgkN_7EYEztY8)3*-bZ0w!KZ(=chhG62 zp70eKSMz?;w-~q4H(f8v2YaL%?@t&#C zR3{iIz(40 zbDzun^f{RGO=ls>a4l=lF#oMquzuhx`pYFR0211OhhCb-m_q6Eyd|x= zUbL-#ulvjxql-qB<^ZvRms|V_HwX#wAUz`QU%^XC`gI%79~GUF;)t%FnLi!w&*H5G ze_(2>-=cDB#jnYx`5rhvK=fs2Kf(E)Eiz#>TK@>sK)GrkAVG3hx z6H78J2m&Sm6oi~4e~SDVoAtg22>j6G5MQOue8>!$rGW>LH(T+si-uS+JiCGgk07b$ zx8;Y$cGr8`YmdH-lgvp{_<0Z2*)2H7K#{A`#V*yZN!R-3uvG$c>mA zTHmTI0R5KVmT$884y|fZpa*ZNN$_qFjRj4hD3e*OjHggVTpbj%d{q&{D<5=Oljds) zO(B3zP)zNjs@4xu%7w%_M8l~+X+|{Y%%2ku3y*^WZ7RDkkUTYzx#4iW)VIDwk1$!x zd=Dr*C)v`T&cmefum2jz@6ZM99I&O1Aw7c+&~(wxE<`r z0ai&G#I|vJf8&nVziq+!F)Q6?Lv9Lt)~ZF@DXs;A3Gc_WbQS!gjViWe*AEHZhX8=s zeH(=3au5Cd{Yt#40OW|N_r!F09v8j!%d8-1gYL1AV&K}ss&=0>0b-cXw}_4EJpmZk z`)tLOUGzTljB#%DtAsy{8K#y(5N-$TNiB^YxP4yLimwY0SDZ5tusr)aaUuQ+_0Xm5 z$yOy6VB+$SZzF3xNsJB>ZS-5Zy<4maIzOH@&eIEE^t%AwKEs5f<(!M!O1~Y4WP-X) zySRQv07Hk{*n@S~HGU{eX59`k?Ox$MMlDxRUA;~4K1l??0tPjYO@MK0%J#?B_Vn}y z4F`2uz@#rb$5|P_djpI68|N!*!H{Xn?o9`=0Q>@C-=YF*&tG$Tl>XLpLD!x=pTbY< zB5uo}Qn&6it$RP%9#Ve~y@!+1lxe27;2QSlfk`aSy1YJsmO*XpuS5kY%Zg><2lvey z^}l~DD?~~x|IgF^xAx6HuZ}^6_dSARjNh`@~dHt zEy}%t(lnJ-O$8JmIb)AW*D7y3#5d+wG|!@bIipD7+Xb$vqYJye;=u+*TWSrng)^RA}Mo-z&mWiYo2 ztx(VblyYUs)JqT-Ip8D@hTr59?CAdtGCATbuTTebJO{I@kU>2 z{h84ZGA{9{yl77Gk#elgEh^e%Lj!tkUVIC7M~ z=l@pw_c-f?jl(S#3(*VtB;YG9bAtdUOm$F89t%N9ny1NwQN>NALrK~LXc|+bqAh=y zppKRxi3=2K4;w@}G8NJx~qMfdc# zU70e@g40XUqbEkhskjGV`YlUz4$dl=C^L6TmE zFc~U=*2iJAkMZ&ZNo^1A-#k}VLO*ZgTdP8t;uUDtFbwBV9;=p2SfDRXEFX@E{&ay0 zgvFtGmhaI$FIX1wHr&JdHXxIV_w2U9rIsPJj8R+gPoz9+?ho{#vV@SwlThy4cEeNn zr|Dd2BU3qxxaTC7$Q-FNWiUYL!6@zlE3*gFuuzbF~>ex<$bhgTzR1d*Ts_Znk=`R8)0yuJQ=yTh1emF(w+v{hi4Y03Vntk2|FG{+yuqbR}(_4n=!gd&%0Qzm1<tm9)W8)PE1x zwdr3$lGUEgulC-bh2cnM2+H^PS2%Uv11zXi-h*X->%ZMw`27_=3VfWhsiB!O>?JYM zh_QmYA_yAg!0gpPNqCl1ni?O}kc*~`BL5D5pS;}&WIzqKw_x%8&efq8dwS;hK1=R# zYIzSaR3-PBvI6)16Ol&f()Zzeuod3I52T}B34wcr^LK8~^E;UVAVkU`X3Hc-WC+))24DglBYSYuxHN5VoBt4D*Db3DEl&E)j9D&npxM>6J!7dbNSW4#+uEp@yy%7th5unC(T*9@_=If}bHiA59~E zruMuY^S6+QfBVl$Q$ds^Qu+V=cVM{dE%+pZkGBk&xUEcMp{S7MRoNn@Vb>DvevbhH zGECie-AX5JgL#|AW9!02-N$|w+#Z79Zb5a+kqzLu=673bl}y!~J>LrLa}|ghhGYt{ zD}$5Z7ob7C59Zw3Gd*W+70BPo4KSsfvQb(wcA@f!qEfc1H0$%~I1aJStA~MH}vOQ?yCm7msp0#4uJ;2}tUe;1R zSYVpJc?2>#u9bhb0YQVn;_)S;Y2)E=&1aaD0|I-z3eJe`8+WZix3yq7K8F5232TZI zMXnfHt6_Sd_*{cDYtdIgYW}9M){^8jh73AJKvz~sXG#Vf_V($~on;ngxXRaw;Zhjhl-W>5`2MAtsmz}VvG=HMf!I7` z(GH~3+~eOn*SDw(h|J7*Vb)V;lESf)%VnMuY<-$YcomfrmzdDFjF(4VjWT7X)^dg$ z=S>wjyr7pw&`SY*{$B|ukq6k3m38>}t#VIEK>-yLxaHe0rcNLBI>@<4FK&oZRlZ`p zE4^5P89cS{`QR>$J8Pn}bAwCA>1)nu18qbkaXQPf#ay^~#o<9Z#+l1=BQH0(F9H+$ z^pP2O;9rlUy>Mdva!9fEE8*CTLZ))6e2sNJnqTJr7ihcYs))P`D2s;4UEAlWf zn#^1q_NM@b2LbIwTxVI&xtq*l+%>XAg#d8 z(ksMpP8M5y_e4zLG}Z?7obj~R`jV#&hxiBkSXsLZlR;E9Tcf;h*QjhOl7pLmF2{k#R7tUO0ts72FMoCI(1MZ z)xrfG9HM`iKeelcSG18WhK=U)0Ew7qlK1<)1f5O^<{iShs+%t%u(PQA3%O6sJaohg z{t+Nb^AFX;$bp%wor-bs%4gv%^`;YBFeycW1 z{Au)UJLd@gT4Ml>^tZqbvWn&9356(rx^po|sGC>*@?y`mD9uw0D zBYy=7bHHrWYL~3j9T7oMBE7R_h1deds0GWE9Fs8j{So$G$4rT_qk;_Hj6oHoqO{kF z3Xic8QgE}FJ_h(eaRj+{dDQ#a^!3cMvVr)3E~7ogXe&DnY+tylxu$+_<%OcGpdJ}x z7e_=#NUepxq0C;kXnH!=VHmJzivduZ(`aT#9HJOa%HBz`&a8KttMIsp964>^&vH`q z`&dzy%T`s2YTwcINye+{r~xk@VOf#EML2?i&gktw)xdQYV82?~=a5yO^)Gn9L%~4t z57M#$#YLkt`&>n3oA+#k&t(@DNQ!k-79KM?C9W1&)y&yul8QEApt0lLN;xB5hm)Z~ zRIO4~i9#c`@P?R0ST%^I(~V8M0R&TgyXOy-4h>3f-Tdp|*v8H+{PQ2YC`yCn0~GRo z*mDg9X)0wSQnEO#N~0d^RItFRZ#rx4A!q&ED@F#ESSiEWU<%96x`cHbdAtn1EKc#$ z-aX)aI->-$8=JzqDnd`u08Sg(x%S(&+>3BaCvcjz6~vpnV2BJhGdf`mwsKMldar-)OeQhTw9 zK~g=VgkhKX6;p31B6B3HsZMIKr(;VD3^@$c4boJOpN*0AJ6RxL4BzM^BNL&Z7YMl- zx0*MXzBY0zF8z8o#6*|Y2Fvz|i7t-KRK&?sc5Ai%su{eV_;cvx?Jx`k*G|!UtWoC12^H-gD zx^n6QxvoUh$Mb)Y61?f+CRiPD&vA$Kp}BJo9Z6-4!w>>r#M##%7JyzMyb89-sCI8=Rml(vsyflxO1EncjCSV+7AE2~AXr7&=3)7m&*;}g8iK{0SG_Agu5)DnWStlYK&<>Iq+c>ApT^1M8ki1u@RR!AinwuyiM z0xd|R2BnupEggtgOn#8n*oIb{kLP(omggCT+a_4HT$yu4{=#p8>Qaj|0uD6|=Rf#= zFaFkrO`)7J%(`01Afxzvd-col5e#@Ay-d32dBwL@5L|i%Wh3~pE*XU>e9!)*VXQz{ z1zH97WB8xu6*EM*zhY#2v?mK9sV!vD8iMgNWpbh(F^Mc*wi(@HBtAs0&Mgppbx#B> z!9b>Cb!aei5_G0uaZ{rL-aMoFHdE0)-ipkzf!+i!yD3*!BMM){P&BrVdGCOO{ng4<7DoKIdUnNL?-1 z{ySpv*}(kC8N_t+>7##+5ZOQ%L6ut}r&jZUDy$UE=O9cg`P3aM&EQCa6g|)mA&0fp*kZWV|bTEqi%u#)?QFABK zRaD#y<6?XFgq0PvW=vM#CNxB9vDhUHr|J|lOjBZ55ZrN6>z!qa$nL==i`%|)FG8=I z^=a(QA)aG%hxLj9*AL)I5eVwDK{Y;HFsxBuvYE0m$-?&A+`x*5qrKKz1=)Ej*u|1@ zb*Y!cA^TCec$z$Os+IbY{XRxvpx|AuPAo|K^X(Oqb^?0N`aJ(vJkJH`=KRwZRzYtm z$_5$E*#e#rIiYC6NBm$o+55-e1cSYe0QzkFWG1xf%-vOc`KPi`ZSm4^_$S|hfL=h$8+MIu>9 zKP%@v$mMyP?j?-H*NcNW_y(1PaW#lf72>&8<{~rrXn_KvlSRebYB#nLk(BXT$KFm7 z91~i1_%9Y06szj?d034>z}mWrDD-AU%`*OF!HEcEEUJY0?q+dnkA*}tvCXGOUX&hI z(pB&mN{ad5{}9%XhvQ`~gx=11fiK>Fv_8W*u`vCqCId}e<5%EBb+(r}LLn?kI~#=C zdtxHkm>ImfO#^#>jl6t-09Bvwe*92=(aIyOyRF0KD}CY;W(ifh?|XwSIB?UOvmHvU zutU?uO-VwXP=3D>A}d4_C7$DesF}OsZwSN|KoS`Mjg-5!E zzDta80x-J`U_97js29M>-lz}qmZl0?oj1ZRplIK%zD%YRMrEQ-RKZd;`AZx?KnW2- zOw1@i6)-F_z7S91KO)3MFy@8{yCg@erJ4nBqa{Qlsu|Y6TygS$Xn-Mrr zRcWb@i9v+l+ZS?vhv{I1jaILN>}05&_`9@nI*VL{#Rk|e^Uts=hRNOpA&tUIYcK3b z%-ODR#@??ENTq>f(!czmYg&ke~cxP`ias@-Q!=! zTF^FO`Cuzpc5)A+TV}oGqVfAsSs?^M4wIs5v;_np^raT<=SJF^!uI{`&#qs^yFd3{ z^%dw2O*-&?_(IQkPHnk1)DYm*@^eYZS;+l;fDz+1xvGRy+xqmcbg2NrnfKH*H^tl2 zOB3|x1<>P@1?$_$nLV&piiV;49xoajQ5=Gy@jw2L?ON6nx3#64pLP!Z5$Yi5-T<^l zTpmXjmw=hGR(+poENWyi=Y;?(AOelFvKwSJ#Q-6|QQ))SR(gn96mA)A4jM_wLzY|%%J@mWeV|^toR2=8_!+uNl)LyKfHV zHW76nFTXR@chfeuBd~ntlE5|N%{T)W8eTfT?)HCvv)@nYZYKm>m2NRYfspmn(d4@g z7mW;HhmWGuW*$r}ZN}x;RgDk5nXwP$gIlLuPrN9@$l2QWsZ=e5IFa$;!i&8x?1`&{ z9)bDxV<^HO-Nxgh3#JopMoPp0ys@))IRBp^fp*(wFagQ~hA zvk_L!Hml}g-xOUzf>+=*8QI`;CdmnIY)-uLAg-Wbw&}XS&e|go?X5Cw;V101EE0|; z1nc-N$54u`-lASs#sSe=4;`p2i2W@>fQY(4;JCR)WgcXpW8NESuPOj`Kw!Q*`d(ZBIGXW zzV~QP0co5$wGhU#@j+!Pg|`Bu&Z7T_b^fM+Z3GSL-jXVBL)B|BAO3A|p^F-`t!Ny# zSkg@na-VecM?EVh8}kU-77gj7-86%ZHpRWkU7_NsZ$wuM4TsvTAQdF@_+LMkFt@d7 z_3}DZ{rwGamRE~GPF*b=LO|OtI9{Axbt+RV4)@w6lWkix;0yCxo0;8C+j%pB950GRq8m`OjwtnPSrtslFc1PwTnq% zqXK@VE{4q7WB4i=Z=}C`7eVUm?LV516L4-b_2(K4S0ZGHiq-_Jkqdky5;f ztpr(hqj=Ah@x=^c4vUe8Ej&7#Go^q|F2n88g17ZY%MPd{`myz*b}l21TgFB7qWe}u zF06LMlgB1kdo#k9j4QiX_A^WL=4zE9#J;FEp$%6loX_E=iwjkK8l*6Pmp;52F)6cS zu_2_m^(hCYtRsiM{vag72=`Ld&B?Q{Yf&SBwqEEQb#RGh^#1)mcYv4EKg?hf7pGq{ zYA!e#oXW#ZF%-gI6jmaG)Nf!s7tqGkudopUsM(zr0T|0wHtz7xOA`m=aZ{mQxK#1Y ztSsV6=Q<9I^5${7$14fg4__cWa|sZ?Lz6Cna5x5=LWxzzrwAqN>~3D?BVmNMo;`Y> ztsLVmiWTyIo#fOBZM5qao&rYUY!mwJm+#d~xiF}X@-=MMeN}C@82|wQ07*naRN8o~ zr;VmoB@-rt_$6Is7D_zR4Xf4Ryhl3lPHJWa9F{>H6L{(Z^*%7~5v0vf6jS%z6BY#- zMq!t8OJIa-o%YsEuYmcuACUmiv6^0RQxyoMB3aGsV7v8jM>2CXb~Na$8Bd+!+AU9$ zE`GdukbMbaiP^aX(#86n?gn`Cg!P*P03x-n_FQ{`D!fH6ku^9UAjZg@xrS%i%73^1 z(8L7q0++i9u}dn(qLw&fAfoDUMJdGkG0m#3{R?p&oYx@ZPjS}}PgW7zNxw57GM(Jg zW+}e_E3-!k3ek_Jz?wT)_?4ecv#%_RJ>l&?u19j@0C&Xd5f7Mr{)C9XR)Bq)+%T44+&X?3Sm%@18RECH==66%X3w$Enw+}>7Z8b zz$pu>xBwu+)+(eHB=mkRXwqEsfClj%A~F|>R?w#6voZrs5nF*3CNOoY5IG#l&-YAv zQSiCsoQuM_E?E2dk~L5iP1@+YpS9Q%pua`f$3}5UV8vK60{}m11-Ov_+%pUrmZ1J zYRbQ>6t*vZ(}T6G_rNaemf82j_`Ll&t_Uxh0Rqh53JI{i5s1(-wZeu)LrHq1+aumO zUqv!u)Udt%_K!JiC#Z#c_Hkd9Bg+UPrgT-~?CCfZ5>CE`EnpFly~n%-kziLw^a=Tp4OS>YCeIj#k_$su)r zG=~7VwX8uHieLt9hUnWi%+Dkh%UCU&w*<$+^Q9l(uG<<-xkkdU;9GeT-L5ZxdmMkS zFz7*X+w})N=Hy&F`6rDYfbF5CERejaod@QofPep%HYg>-PB>&L`z)eBz=m83nrDDO z2+Ig#866JJB?5-Efl^!9iu2)AQU8-@&I zoVy=xj{*5SM-}tOMF`O7Thj&u_;74f`~owLKVz9<+4 zY=P69@_C=hvI%So8=R#jyb{uDkqE-uvW4>2`4%@kf3+-5A%yrY8*tB_t7eeO___T-6g{uu!vq4)YSRAYY=cti~M3g zJR;GhfwjKR`gqF)Vg}hvyiy5l_eI)*TS6Drd`0yy0+Rr!%c;1cRo0R!0AWl|G-gvl zHjucsv)QE@>*&2QaJm~T+@-qvt=8PbkY5k6I9arj`8#K0W3SeuO@%--SnDyAyr=(0 z;mO|_YfnlBHa9>D7MLY2G>g;$q<0r_K^Jgy!C2;XeF0$cW};>_;_NP7y3VGMy zs0jfX!@%U-40Nlr{bq^u^qd8I4aaVOTpA12v8ko;4+JbxS*1a>uEftz^AYY`UJxU#8*H>FG;Gc6Tv-xu_3+fg{QFcVvlyGMv{)xtQ5L;D}cJjDxC8mPVd{AUyC}_ zt;T@YtYfusw~^G8B8#tCxkQF0zewD=e@YT;pxl?=Vp_{y_7Z)NLxB)~>&PSYa();^ z+?$#r1RCiq!P$(wHw17Op<*~y4aDe{h(b{`tDuWN-;=TyDdF={u^AIq*Y8t~!o|=D}G7P7_LVm5j2MLUn~$Cb+i4Ep6p_ z9;yzbeV0E_*$5k(BJ}#+1+eScpGoVl;=7PjDA-6Ds724Nl{-xFT4+4cdF@FqP6agQ zn5?RUcye*K#~ez+<-*wNsr!g8|v zW7*nD{_Q^}!x2FaNZ3+OxvxcS+Pfd8`1UQJ45Xr6L5zW%D{8gq7(z^rlp5uB4?`Ef zRc6XoIN!hK62DV4pg5J4=UBhk9Ax)Dg4|ej?y=AGJ)85_agp~-R56}cTk{yH+X;j{ zw}<t&sEY9;NabuqssJB3Eq}TFMe*Wqt+_M4To+iyHIkd}iIs{x{kmpS~oD;$| z=Y2BN2A!XUx=tSJ?Mmv_+D~BJMwAeev1O$k^ITGPdQ;(jewx{|=GBvkOzgXNzBii@~s%c$HFOw|aw4TlyJsLWT& zzx|h6BP|%wh~7RgO2cxJUJ_PcgHignl_>%vs|Pem7sA|z3$CzV;vWJhxa=RzTDqh` znCE4Lf9@8cY=_|-Wh(Bar)#zK{b=cV_PKN+fc(3ME>7>SjzwQGWpF;T(w?DfWu^c+ z++sMaaQiLQKnro>A18&ug^1n|?`H||EiM$0NzQ6iZ(Z_t_;&zCQx;<-vM`l$z14%z zirGuJ-D_2%na7n|g44^se4 zIgC}A5BJK+6Jtx<-~OQa;kTg=?cW(2? zR=yn^S9Vp?Q~!hn-J`_4jr|R@s;yh#JlSVb7dZ*1M>X@A-^>XIozOjE=hXs5ULs0c z%feOY+FG{*hCnS-_e_18{VmGC);5Cr9GXn{=$S0F=tELc-#LZ}oeMqDhBB`7V38!xwIU9oHNg*3By|GHmBD%BG>%)zD zmurHkMv}zIiI(Gs!-p}WG~lLNpXYx{dU+;3v15eeFw@)rbBHK+T<@`&--8v!8#;o@SQ2i;#@mbix>A$?nca)Ig9*=Kd}!9Y5&hN<5g z1h%N{p7cMIdCxuTjV4;%nL~&{Z{AD*B*NqzJ^CWVx+WjYLq-IveW zKd%a6$mb@`cDRXQWKH$aJtr?O33SB}wV+G86>Ik4o?DrmVF3yO3DsPb)RH-~3oCar zH$c=X=Fc2aPBy79mCAvQXzp?Leo8Phow~Qrue6h@T9iYIgbzG(z@j;pCJ z{@(6J=WMnRApjxmSc(uNNPAQ)I2oU4BcnxHW z1fQU)5{5*kmVV09Y#{CwDgH8^vzLGiXrI|3BO%1S+rvxMzXC^zRS4N4P=T$cmF^|x zFk0gqKc5C;o56m$VjRIZS5^w4V1G#S6;VM2!K9I#ozx4joG?T$a*cPFY$HUO=;+}p zag`#etAL(8>RpIP_4mBjWmmz>T`x04+z2p;YfGaTF=9VmZ!U6WtiUk}& z%KS=S5!^`;h+@@#6T=Sn|0}e~r+Hb!0*ObFUy3ltfZITP%Z6HkdwT?%QR#GwaYFqJ z57%TnmED+ApT(}~ECJ*|!b@!Pj|yWwy-wLXH3!Y5H!p^)CLjRwFke{v6S7$OzTS`J z0eE}5&q95P?tJ*uhlfWXMRJ|mdGr0cdQJg|t~`aO)c^ZJfU3un07P{b_$(sE^ym45EoW0a$M{LLt1Tj9G)8T4>SaMzN#{&m$6@1-+QzxRlsYnU zoEpRw(Y)G(kl{@bFc`WMIQ+P85oEeDd{CP?N_gtG8u<|wEv!YIUoN!(t{hl^!ek=x z00Wf9r2e@ICF82ls7B{%0HT^ss_L98qq6_@pB>|Vwg&UKmzd90@E%54{=NQ@?|^uV zLg11$pbrw0*h1gmRHLDSj{8a%5=ZoWvI5~PMuE?;yukB9b_a#G=Wmw+<2jh(7M(B^ z7wYP_koye__6fWlQzmVkTa`5_I{ca5y)~aCPVs%PVZ-CcnkR;TZ3p_V|C0sM_cMN` zmN0slV$A+3`>Z>lK~u~e{`5&i82!f2v-%098P$zXiHNYw?%LKzkV4=Eu8w+H@_szX*xkLnev>$kwv2pg`Z# z7ep#tQ!w>a2l=O#4-GNfz^yG2)@mEEuFCkiZU5qcXAOk!^;$6)WRF+ zH7tekOB60Sdi(qH?Ais1HJyM^0baHWoU9J#kmXH8j#Kj$a=`xL!#WJkVz5DuOZ}6E zhL9enS=&FAeNW_YDXpN!U#wlMwjW$6rWg=hp?L@v%q)XL#){$L!s&fDyG@WY5ME-K zioCl>uF9@;g)Aq6DgX=s`Z4zol1f7Nrs5PN2|wMzprO&@<28O)Hg91WiGAR%#{pc| z29gi9vT4+74nqvEGJX~R-X;CFSNq${sq5m9!{@lR-hw_;I;-6K8*k`0yuaE6j>$aq zT*?mqkN+oa)dtLnYb;Ep#X2a$N%={*HV5j&@}@|Mp+PmRaw~4}v8%IorFY%b>~?ETA?nUms+M=tNinQG}#r1u1^(1TcHw0_1CL2Ky7)OuB>9@ zOiyKn^~1NRw}YIi7saJqS2WWj*0%!)W3hfx%J}gz>@(a0zP*Z-lRv?wxkUk^8CJ?ba50%*o2ve?9X#H#>l0B~X5 zv6$%Z#@!sVmJTNb8zt6X&Z!RSR`Q`8Rb_WZ=q=+#>Px1PtO!2FIHnhg zaLk<_A@V})8-Fi{d5Xe{oB(?`O&q32FiYUXz)U*_zIHx`W19PzKrVP#*a#l0>=((r zc@=aauBQw7yBoHfX|qZl-A#ca8GJUCaMp>};6*|&O1l@a{{q%2lBJd})57ogiq#Ma zMmER-xR8ueSVdgE%>!!qXsA{wsaKYMdbmRGl3T_;54 zRZVBtzs8HRr!aFu86X~Z9jcq71uHa_ZPw0PxIIJ?J{HFBGF7ZtN z+kX|lIfU_}c;6NVwSuvFE(a{y`=Jw-QTHTf&f7y2lhFKq^S52|+;TMNGp0@chyT%S zK-#ya$}C<+3+esbUM*`;42FvCiDd`@|A7zxxL;2=Z8NMZ7!nq#DbATPH53LoQwe&d zML1N>l=ae5=q3jtl8VJSmf!Omc2y3Pn&%Z4p-e<^g&Un6@EP+5)fSBeZj0U?MYK3z zino6;P50KiOwIEWZN!Q!U~9E3uBrTd8f^ghIAp*u{#|ttWmO@EK|81^Ecz^m~GpfWcBe8UBp>O3KDDE$dbrx1(Evq2Vzi zICIk3HY_Xyvdvd@OSZsmSmxWM7v$>s`WfOEE&=jjI7Qg2Hm2+{pHX#e(~w*&KR zwv?b;wmABa5;u7z1>1}ugIVaPAk^rigPHC80(QuuMs3r!^K+vn& zGRp4Zao}2)XlNlUfguUeLE%=}vgB+wa#v=W+1+hOcgc2uA!RY2fEd6aH?Sq3;0}IJi~bib+I4;bqAB zS?T`WcDbSrD7mI^J}lQoS(}sS#mS1y^L@r;v#0)Hg()oQ6GT&&=u4zHMb63l!6+-F zmI%qMVd`hP{~`?uViJOJMq4XX?z0SGZZT;qN$5&ADl--143>(4?k|6PSrQ-citn?H zX%1`Y;)2`ee0KK_St03|My&|!{Y+*4rp5w_dVhTu?h7fu7hykZMhgnVUQqYJgk^9w zjnL2@Z+km63_pJ-Fl$maLajt|r~?1=Iowv_IbWzxaQSs1a7H z-k@ksTfCtDVYdhZ425G?()T3TeYPj!lG8YAK&T7GOocL+?xC~{+WxoyoGYfc5QM>! z#EN!lYFf+u4At8FoP+0WxEv;|__hKVW#AtWYsyd(-GX8{Wok;^s(U8lFC_xJhh81d z{(>ttAVCZzZ^7&ZS0HT=0&&GK{`=9AXw+BbAkZ>DzANB&j@;*WG0*qszNB=?GNBh@ z(#HK*Q^ak(?Rm15(9-P?yPc*1u4qU6%&h>M1B3IqXFvlK{X44={_0t~5(vMKr>z2% z-)11jynk0wpf5$%fT5N8KCY43gUS@IYC8rLQx_f7Z$G$yO9QU$=r)sSY;$|L#2mW0 zW=vU#aK4H`lFRt{R!t!_#6OWJ*)%DaN$fAS7KV{eHb5oOAXVp!sSpZ6Fbubc3W)%+ zzC|v0E>NqH7yx|BJh(KqK$Q_|*xRcxweptL!)VHR2I)zgKzuwWyhR9^-@BC*%^Fvh zXN2++i2`a7E&hZ5NovzZ$Tg~dO?0T?(v{+^70J;&oAf2Adt%ZxXF;Fu83pt8J)&gc-hK=RFR8_&P-5C4tBj;_M_Th;GK;mjx zjHF%VwI_Q6uS!33Ypj~bCQCquR;oENb-+W#cziV|bJrU>AEl~hQa+jOfPshI0J|fM zWSIee>@VAhk@}h$wNqM*L0SQRig{yjbF0e94Y=x@g6O4SCc{G~NY3j)X0wZ(6BF#} z;3KZpls6C>AXWQkDo)99v2R`d2GAhPZbcWtF6N?#MQ-r^0Il}slX>n8tA%G51wO6P zE9y{i9?MAan}<4h|5ynN-8e!?1_@B9m#C;#Mid8Ci`l%~6MY=sP;Bb-hMqU>A~`GM z1Sj1<8k(XJIKp$5&L&?|2$6_C#}HhVO~)OKr`jYz zKM9Tcx!T$R*{NfN0&IpVa`J{m(JpV{^>|sP#ul8p(~KUZ8iNGl7T;vEQdD8%Jx${I zUyedL%YaW~TLyv4u*&hwEz#T$?ZJs>RXH+F=}TQYYJ02F29V_i4`TB?|cP*K1-y!v!V=5mEhZA-ae`RECW@ zmlZbf4)SRE0;BP6K#0_-GF1Nx3nw&UOv>R_CCFG=WVrwEH7xUt$&H^T)hFbv;;jiv zk0%bYHJckeN`>5qm<=vzzHC4~EV4-g#ypU;**bg;L^`}HCPvV!gDltJ6@!M*md)V7 zoEX$uCI_g77v)n-9v0Zu1ddDz1|9LeV?c6_v)EgeBz%DSwbTEw*KAN(NJV(^TVE@< zM>_~oA+enJHVVVQP&Gk`DY~%mS)B1bG%Il0eP~QwpAV8>Q`K2!H=)8QGJY!F%_qFa z@s7(03ZHxf7gs`fip-G#?Cz~D)^3e1o=9WZD!-2;Lc9p}!`W z{RHh=7KjHZM@9qKR>X@+O&=2RS{M9Pt_$}k*?b8iIDp4t*Cx5k0UDgphY+Z6W2oZw z+?6@evY^(2*Ws;sD$JF)CC{aAq(aT zhvmg$rp5e7h`p?3JQQ(_X5+S>a338OkSXxI&NS>{;r;LH8oZ|r7vDO{#ghk36x3^| zOc}<6|Jt3oCeRBNcz5G^6vgTFfrM~5k~Ot+I@*2P;aRu9#I$CBf5870bmI306BLB5 zxS7O^oQk1cOt{2ZAUOL|q0Js9Pv#a@#>~A_%BtM}_Rjo5WsSEJ@~ z8RL`OV{sN;R;7sWc_F>)W&Qj5Q(!IL4p6S1NoAX0!e=*vyg-?jmPQ%37~&l~tiJ_O zn0Nr(DeOj{hcWk0*5d2m>5$944ZdzwdaF3UER)6p@hm)7at4aoO36XjRheZnc9|Pf zMfO&~=h`mBPs*icH~M~VzW70Cg2tkmb2>UJlmgoS!|eg32wY zw)^Te1(~0d#)|Vx)Sysgk@0Zl`md4fPuk}$K71~R=j}fnz~}obdGd@;l7O6M{9_Fkm!XO0Rm|g6LT)F@59mimZDq;6y$s|% zFcBy=nc|AmiDj0JUyJyk=zph3oO3IM5Y{von8(B%6W&(t*`8mYUHz(~-=n1qD+GN} z!+xK!KJOl0p~BTKZIvJYby66gpKG3TsVnXL+vPmXM|Hqw(QSAMg1d zdHO!D0s+V`l2-2r{sCc-*{+|B=_Y5}6|A;**g889_j4Ig%lLUgh|zQm1kdf!;#UM1fR7zS zfcEDm7s70TNk!54thj__n8G%&3Tdzg+VD4~cmeU}{5)d;$V!?B_etn~0!&j{02TKo z+$M}3jaVZiK1oWeTQv~lUMyvCn%;6xVHNh=gL2%TQ+z)O$QzNVe<=?$lC}olu6T0` zAa|b)mNr4g_CD?;MPX?bUnO1HX2dC`ezPxai1Hd_M#d7L*4ALN&j;+gmCG?e(t4UN zp!qw6a%yFuR)}ebQ#6)`jTE_>;@#SlgV9q!dxch!-Q?~XrPX7elvSGjavCU2@dZ|y zwjh11)E<$_EaVm}KwBhh@WuN}epnBS2{{LZ79$Ay7_K-wFz6Q-a*l;$nbG;b z7gO!1Q`lXddHyO#A^HQ71;92_GVB!GOFb(630W(FCpBLPc07GAN@rT37%Da_R7NRA zE=(w6@Kfc%Q%B9t!<);EYBb4fjNK*Dp&Y^pADUr^!5hLK&N6An)dCAf?xk)KFqz)- zS*R@=k(u})kXAdc4+injPsJ&!)*0oK=;szRRqDnskfy~@mJF2z zEmaE_woX)@51Q{hPgJ*s)Nu6YcD0H0pvgWj4XBhA*Bg0s9th=n785*<4Z^7nW(Q}9 zpSb|4#ByAvf_p=npSp!JY8{0_h#PLSXkbbDGpqnpR2g~!c)&|kycRkY1@lO1RY5x% z%f!rW$Xz}Qjw#l*2LY!(OxSw=7H+?oK-DncfBR2y=1g5wA-A`UTGG=YK*-TR>7j-D zaCx8g-$GlX+w;^*g^L{5U$yV4F;kildY>t6!c<)mK`iz{83BZeJ;UCh79?D#SNydI zS4;r65cUwVh6JIv64EB2t?+%oD+K4VhXx5n$nF&PtUV?+c)ooruh9hv=Ud}vMU_9 z-HRF=Z<#QCGG*gz>b6nV9;|LU+S>N8>1|jD(iil>CG*7pf13VS)w-?=8ieZm5KTl1 zK?5745J4~+fgL*%F?8O6>kfK{X8*Z^-noJ64tnM$BQdg(5s4y5A+$k96$#pj!G8bn zJO}o3_L_6eQKPCxP5e^*#2P5>Y+n}L3*8ick_A_szxl~bB!Yb`hb!qiRmmO$3Yetq z;*SYaxa!%?iu#^J;=1N6m7ntw_i=iDdLWEMCo60AlSOS+R!f(oDQpXuM;AQ|_|q*` z*(zEqdY>U!8?$;HFeFxcT;C@isKeU`L#;6N+n#1L#YvM{Zpo|#Lg&FNb4tCggb-l3 zlb&Cv;NIg2OAE^71rYK*7DGZ_@&`G7yWrL~lCV}#V6%|8Xlv2-gRLpox3{*^;I{&> zKZm39{oT?LE;v=NpGmJPh0EVK+PCKbSDFf`0TpneAc(S`fL{6phGPD9znj+9R{XF3 zpv;3S`ReihitZf3@;=L8kGTjbS1;a%Qg^FhyPzZ~Fu*-NzU|aLS#E#-R!(}^S<0(g zGAsF~$qUfBF^8W5p802(uTlve6dm(ih`5gxhv@fA0gQj%Xe=I283FtI3A~NS{bHK{ zr(jcPzsHqx$*pMiV7u(}D|hZQ(T@m2qzRfH7J(S0Z`X}b?@w|p?3ptahFGO1y-3b` zw%^We5=#W?4X2d%;f8drkZ3w6Ynqq)H$Nu*tu=Z-UZ4lC<{-bJ{K(S%2k0sY?#HIW z-6pd?8G;oUQ52+XSK})b-xJ~%cR!<=l+&x9vgTi}ftr$qx4|F5T^HhVo~-cPSoQ5% z%-^yuBQEBti@KD!Nm(J^216_hSN!!xS^fMO){=9aANa3fgwp(+QKi|FWpU^NQ0uZd zI54!+?uEV&=XVV>S7-TFuN*fV!XMkcfE=CoT)@GRiTrk6-SeuJwZ9Wl(}d-7_u1(H z()U0{ap{;&!3PY7G;6nS(Y>t9`91)G7z%;+)a8$A=KJ^ktWb3^U?HiiWf8!%uChg2 zI17i7%<%Vb?Gc%{vMdIGiP}U07v)7Q*nu@d{?q^c!7ufM%obDTg5iE9mGrBKbHe@C z|5n^~0;F({!Lvb#2n^02S(f)KiDY^kS1z1nPze!4X2A?tak4+wbm=zT0XM8=WD3Bd zj54&e?z5d$patPhG_?Q-@v@fs6@yz^!CFOdW6&Cn?t`qQZE_CofBiq~)#^QD5XC+H zeK2iQV8&;AL!@p2KoUvWDW&7Bh01~6B2k%gNJ}O{CfknoacYzEvVX=209qfun#V!l zXMV3>0x@T`mVI)*0<3)&46FoH=unx45{4=NM#Bj083Ig%p9c#$Hm&XZ_a$^m0s#vf z?#J1-g85bZifm@982;ER{5@Y5`U)n9K5tgjVG5X)ZM%RcE zvUPu5lVX7;wm3E0XX*#EK=@n<8v>~eSzDYse+8~qa(Mf9eGx-0O?f|8SfMA9^BE70 z-UI)JE9lz@NPzhzEsG4>d5=qYpZi?aZPR+G&KxtC+G0&3TLA#W1!({!BmGbQpZ(U9 z|NaX9_(V&2bBq)$BKN?WbfQIpb*c3J&W#(YzAEsN2naKeEoU2s6b{t;8;nNekf`KJ zwalnS`h6D9Ko?+fYq>lCTo^|0XSGBunM{6sZA}4Jl|6ueO#lg3wxTK1Cs>FBw*P4b zLfOZJd)v%%3OLRpo%Hv~KVInl{MY{CYQT2)=(Y zO&t9RelA3~hT61?>$8OM>)_^-VI}A9Tz>vYaZzqx_EvyI3kXK%R>zL0g%Dw?E-RSY zpN+e==6L!)`9Bc~Tuw#t7Sq4q`9T(Ppqya@Y@PuvI4s8DZz{cvl&{(W6qxDUQo_`r zPd|Y;{FN(h%Bb2&b7_U?%3l6{rlf7&V3Jr3hqowvRpb3{(b zBb-t0Q7~czqfL4%AR{}^i_(1jl|^#F_}P>O(EC^kF+kO@UiNQ1=y)YvIFI3C{`H^V zKaa4iw5Q-Uas~@2Fzd1m2J=j z!sLfGheDPh%C6f%pah@s9*5fr>wN|+|IiyNw&WTj6}vDeh&Io+Bf%jME@}~|;+zc% zwm`Z(grBtlfT_vPfF@4gUZHP5QC}&R3Bvi^P~a4o|580j{yYEo^E>{sRAoU#s~^xCA3)#70QE{Ml7WLWk!&QWg^pK< zRqydM5@}N@z|(Psw|>&xHaJY*LJUCC7mv-_=Or2JPiDCllwN_X_aF-dI2-b}jFthY z-X^w&GC;V1Uq=k4Qj5M3xdl%bhm4`CTB;oQw?Q1)usuy%n_t+&Hf-8Xxaks7e48SxnvR1VOfSkXN?^$Y|-PWZw`Mc7VgXi1W;^O(1 zZfjj|@vj=w1ISuDu>lexaGTs3E;|f|XGOd<5~F*9zHqP4ZG_4~%aH$I{N5?u+U6j= zL`@B!(G{n;pUp__E5u|Qdlup@>5^$G?h(ILBUv<G(Fwb)SE}8nbmEVL;2n$nx z-|SzC2Z_Ta!cru+i+UoaJmHlENi0hTb2`nu?&VH!-2(*XwaUj2J=1A|O5G2jusHGrvH?=22~+t>l8_v3H@v zlLmxh5nX;pyS|=Gao3H-%&${v_MO%RjopSdyt+!9co{-jI6W@zLOFLnW}!3S5r&)H z{e4Yr>HxdrE?;x`fB#*&OQa+tQfG|?p?iwEmw~Xio(1EX7tDg+|3b`xpcLftI#(QT;hV6Kq+=WWl!~u=#ySNeYVhf+-JaJ!jP9; z39D8rZGmRRMsi>*kV!f&>Bv;WN(XotpmJjOrI7vCf2RA`y=kOI%TRnzB3ruUhUf4V zq5eMO&JaanRshLbOoPUrb!YQ8F8qUF!;B#RiYOzOoVoxdrs*zxq#{62AxY3MsYtP-+5#=?RV|C(-x9G#O8!lkNxPAChNl!0{vjuZUO)3TQhu>$qja2E zv~Uzm@~mELKnOZ74lORgV9{jK?4XyRv;Og&_47hR${n zlD&?-E2MgT9f2=7q2 z{&tJUeRq_gG1cG)jm{GorvCL`aO*8aLn~XzXYN4)SSFQd<{^gF7rLMCxgFK3{OU6n z5bFEc)Yew5!Q|X_*l!xxj>DYT;`u{F%3zXtf}f^70c7<0Y;DjneYWn8GXUZz@3VPs zFJt*EG7SXCnS*f3XS~PC+l)*EpV#MpUI1v1noa;T62?SQsRXg>7k9>6#EK?3np1d5 z=pPhdn0T&5ZOF)c8>m#WXKO9plYq;l@Y~+0<=+SS8f4Il zab0CQ=plgs^*)=9^=%OScFl^=68L!&H^%clE`6zhd$#7RU{hPzrs<$!#d&PNUAGp3 z{aH*zS8Z|@0Jq9sNdK|L-!Au8F3uATayHRYFvU%cg0@ju&_pMrv@K{uA<7jmAztx*?GX_+wX1X>GwTNr;@*t~Jny<^Q)lCpSv%m+m zT}AlXT3)+WTOX;@MB$>`E{UEr_X4WaH4TKUX1*wf402k`bN?auoo7FuX^I@E;E6tk zy@&qc&IP)vfz|fHQWl}hXeb>*4dw8A)UOMxChO+zDniPxeDb1ot>o2Yz%c>TvC0Tr zN1XnJwCV_|A;Myr#ERcQS#;r9In_VczyST;0(@_zs>rXHEkr8Td zZnVj>>wk-U2_AsuVpeIBopM}qy5pj3NkdF0#rDEofS#OMw;WaRa7uA29PaWW_?tqq zBhr@2xp*>wE+nxNvD$T1K47-EE%%-_Qdyo#MK|1yXjf6dl#ZrOp95M0GPd0~IT!0? z*hnJAuya^#a+GL=kU3-U#oyf3?+}2O`+=*Sm%e>lC3~~XMH8?w56B;#Py4_CcRC;8@0aJ(FIS4XI36# zL-`&#TNJ(l^A>&%(eH1^|0YYRh zv-t0gE)|$T)9>($@^2AWOvF{rwv!Di*fMf_vFnO-wg~PEft~MWfsF$PoSlrn7;|@b zvw|yMS>OoV=>1t`%ib836K(5QZtqps8m`VNYjxLFI;Wz#GV+L}bE?RlDY=2XoYyQ}lFxf1#RD*kwU%JoQH44Vy_JU*+T^UfN{VW@;*TfQw z`GAQ_%f?L`Nd25czh~3>M!<9qu7L6%*&ciV8RYqqpDu6l;#7Sjwk_b)#Mr9&H@!Tp zc_BOy5e$68b_R$MhCi2A?;!Ixx_r+7{y zA>yFuJ@d-g0kk;fG9hD;f&glunN(OM4nEmh3K!SW2SKAp~T}`!8*)TP?@d%PUbfQ zwmM}nIJ~ujzyYBsKFPHmpmXkkO>FtD{S~Q# z&Ht+~!=?>L24lL2)OyHfV{cFtP$*U{lHP@}$KO3#I=Z0=o>>#_>4$7#)$2$;{IR6o z1%_X8zmK z&$g+Y@@MMrTY<)6e1DWV$&(`V`O6TZ3a;_oMUpx%>ZI?T6gTE4;!VEmC2wVkm04^6 zJI39NJzJ_ln^_d%+XI5(Qg{fS;B8>}b1x)oP8PW?Nri%cLXWoiJqb9u6`e0@U1Djq zhYQqF5=jezn^C;{gr)Z#TO>8xKX)%{odDj1^YX||5Q#he{;hKf7fkx#onRY-7{Z@) zzI|BybOwdcA(C24n8{Ok3g?i*U8m`=NVd6K)Gl;poXs;91x0b#!! z$~O;qS(S{hjfX6gW$JD}v&PmfqEuQm!$SG{OcfbSY9$yDC4wZ#?xr95^ZwJUe%B-5 z^~tZ`u4{Cy<)prIghP%T3=FAhu`H$UE;1Fi)Y0&h@Ae7;Nx;7+wItsbVMWEI=>Jhj zZ$MTh%0f~ZFb`W0>m2S@D2G)MABQro?awkCnYaYZx5|uG81|JCw`!oQweu-7)j(_* zUu6mCkL&#r>H*QRng#W8G#FyY@BTeYy5l?l^4_ZDb2rEIN_GKXUgv5vo}DIkmYp{DdT$bGQV=eDxZ^IzcEOH>MFC3rknWl=P<>RwX+W+JV6CHlH)=r{0f?o<^kNIDDZ)eC)x zVsRHBkn9l!hyF49U`m@YXQPow1!n*m5^P!8*{_90fe7Cty!<++MS`hh%dE_M6M*(@ z>|=GQ315#ot>KZXsQjT2uY(ipgK=wCq#1PaGF0gZUzr+6s?9e^rpqx+qRMpc;946# zKUx_ev2X>459=~m*IoI1#6wki@iQ4zv+^nw-0(%t#l)jgC}>D?1r zR?SMaKs8|JAH8JdmM_I!3RMEKLF})o?I1=8_#}t`RA(;nE$sFN-kWf9{t8yc9_|uI zTVH}?p@bP_l1#3~Pf0+pw;vj+{Xm6^xZT9~6{2{tMGF1wDPffStR8fl-)o0DsS?CA z}J#||P!nEC&PI%EY@3gDzuaT2!3a<)FCYuQq`zyfHf1yC4t+UFtjk+yjT!s64* zZ(um5TM$<5MGRsM>=r4Kx61O~vy4~4M4zr!FPhZJdcCFr*sRI za>5k}L^RvKRy)pydcQX6U;j%ts)>eYZre0D9E6R z3a3^e!TOX&*uFy1J!Cy~ZmKqt1_18Cu0&#)-{r(%UsQfX+;$xG8pz3VQ6c|-{~z=# zGJ(O6qrkn7-oCAYaL=+EyREJB=SlS}@viBe3Z0)CHe!&Z3A{Zp4IiDeLcv zxW{u4Ey*yk44=_fd3iy%is5v6`%+6NTyf#&@9#Y;Y{BPyc$qe9e=p!PR&;LU`8@*3AV#{XEY65q9Od^;p$27_ zylo~m&)cxuRxM3(ghcM07E+s(d*hFn0mB|w+S8e$A@``>N|>Kx0K)ydP*i&hTCjBc z5>US^b9?9luy7k>Wa_eq^B8QbU4$W+Ih%QYI~=&rEhx7bK!MBW+tEeWZW)%@BPM1$ z^f4XunwkQ%Nd@Afrbh%LBWL4U0a1dwy;j-f{`ocMv02l(WTmo@wG|@$LYa^IRT|&p zp66RDI%mQtsimAo0$Jwo9P?_QIh!&A*XS`|XxxT7P+x#heFbw^0n^^+MJ>pG`b%2# zB;4A*BC==E@56@e|j>o*_8+C&J+1+fSIUA?XdD+P(Hpf1}+x$#!?OEhf5(5A@y`Q{A8Mb$i~YM)%7 zJjW_0D~hwvs3M!%z67;}ysf?(%8E&v`cU#*7$l~y7}Npezn6vGQ((kXqy{`PS%9X(Cy~vXx+D=WPmRrTYb6=#(6?bRA6bPm!br)nsJdhSfbKZO`J2- zD-St9#5pBww5oq_wd=WX7dC6ah24_ro44Rb1hJ`F5fNT=`@r&|S-Te~5X9M!2Cs>% z@Fal4pah4qOiTqU_X4f0kzvC1zDOK;p-rKoDyiP47)(_hH@3WLBA>#3*L9v4p9kms zu+v});g>gikQY-O*?+{sXD(mC5sTQp#%mvwgb3io`g?RVC_%N9|GfgR9B>G)mkVS2 zy~KY!{DtbwOt`$`Mc>{1B>!wFU7mzYT@_u+=k54T9D=!6ZZ+oldHc5rj`k^!zS;ZYX!DMC!YkauPlwHdJ zH2#r97KUB==Z-N6I0{Zw$^wl#CdM|>3w-$^PRdE#$R+HYRH(us8PBQ1Vc<2%I1(tU zOc<5VRP~qlY(i~(BBj19vH|cpBTe~oeT4`0szWllOL1o7lV`0uN^l#vK{B;j9TzAb z7FAIFG~7I2|9oznhkQy4R2_ny8X=G!*GPCh7Kyx@sP*q;HAT%@mR^>kJT5mc$=+-F zHPBC`@H~89S8M*l3R@Evl9Iv4Uj~Flh&!ti)we(~c~6|vYPU-8u9=$0darStc@u;s zW9beyFA2)rp(AgWufl=|UL5jw7w~?55t>eG0aZ=!*(4EBOXVGXX(?!cU{i5vnj8>sEYOsTB1i6jQPek+4idqMh~y9?MEJ2ZiX$_1Gs zInS-XnWU_($O63F#fuZRonRqjU4YZ;26i5>zUeU3t`;5@UBMo_#(7Sov{PgZ8qlJP z<#}G%B$husV2EGTJV{;^kFa#To6ay%b&V@vlnw+qec3b-h%jcz{ijMsJ68|Sag*kZyU(Qm*PA! zEp~n_00v$dkr|T@lXI=08j!Y(N+~Q+xGC&@`f7J&deb+smxe=ud6qek;F|em#k*?h z^-c?yg&Rs)ju5WWIq3f!qY`?pVa2v3KP>7$^+ZcAN7(FSFF|h^EbO(GJ zM!Cjqx!w+XgI>@|TI2uz|9Dxhw|fF0l|*Ix{O;?^gsM1%sEuje4jwN-ZkV&77JhDD zVLw>p*L?F9s04IE2DPXxSs97eU%8X!PDiP=f;ODOQf}Aux5%W2SyCCXuZyVgQd|=A z3=^kFVtd+Kf>Z8A2>cclK@>*A5W%_B7CaBA-TiF2E-GQ}2X7TUY72Z>KthEHx0!AC z^||M5d5QDY=Xv5pp>9(fKv-$e(WXXP`-3$A`ApxBy}jy9dcY`%TwA%H>5H5zZ3Ttz zS(0woEm{*LX~r5=7zT**WX==-TY!6!f|-Jba1AsEi!yNDBFy{zY~{=rqD@B&z$cmG zm;$%bdz+Ta!l06J$lJ9)mRa83`)4kCrQK>@)vlVJc3Y0Z#o&ZW-~t`6MljB+2pRG7 zhKPnK(CCYE_mAKH3ALv283tCFTEgJlXkz5~jj?MhLb!tZjy)y0vZ2ncsYE3P#0D zZ%En^phWWUtW5W-nltH5tz0ONjFN?yY`y5=&msB!`&Jb61*~*%A6q?dJ+Erp3k7u1 z;0jLnSAtQy>Lp_8SMCzRm3{2Kirck*F3KkWHmC#&VqyMfSO9g2P5^Vy-)Du}0ux%{ zrjk%J5DD;^jS7LoT4)vNRw5vDg-%VFrY)zZbU(MY5Y$&7vo$@n54SdL6LeX(x3lD` ztlVc$Htn+Wo0OjlgK~`kGT{5!70$T0k(?iCr6`-S#27NUqRA`>a-Xd&8gmw=U`>(t zu`Q;y0QYS=)}Ux{x8=UBe|OMRAC7UllYmR_GS7zBbVPDFRtPGIFt|sGSyY8|?YQ5X z<_pbhL)F4F6(~^7^-uhSoea8SIfyz&)ESbCjeA=U#t_6JtUnLIkyJy-S_ucxDAUJpm$*Z!UpTi>JnrtAFS}M%}wK#Tc2v6k9k+mCG{*SDV;Be~N?1C`-g^0hZXbNv);9PLOAZ z%l5XKir)*V?!Cb&&aDH23ur5+E1Xe_EI7x(-fpf6iW?SGsnsYh`O6HTS=mS=prH|6 z2qSDdOlgrhU-q8;B8V!i{*S0>T@4cfJSpRId8;Tqfz|;pDKoluw zi2Dfw#vMWi78F!SghOGkAR9bIr3eMiV{Ps3=r8WV(9~YQ5q>I>*P8FI%TFBU#F!BO z9{r7TOmlx)Z0ZGC*uD*E3EaDMbrx z;En1G#mu>d&dmb^Fs8>((`t^XG%pq?d(9t9t*$ryLO@zO#hGfQ?h1EnxFR=s>9qbRpwAOLdSe&*-5^mrKhEou|Qao$#0$DSfHu6(0HB0ce|8Lt8RehdTFLWIu~|-{M9Rl?f;aRo ztoRPatRt!dW>>A?kNZ8J*5o#JCoD(7ypdt#AQ$0+a;+Zxo{EV00dIAMo5&abl|`=_ z16V8|n`ejAh@{0vHPTnc@O<;=fA7 zD{y-Ok}rS+YpmaV(%qPt8;UqA+r40P&Z6u168D$IAaKZPkuPiINhcv3Rybvu~ zTPrg%aiQ?Dgv19GCN%&=K)b(zoGnb{vn^o$EIr_BUf;_u7nmO4T7(I5MY%H!-|FGU z*wG`jaW)7s%F$9P3and|?ANdRi+Zu4_JG`h;PqG1anbK-{mm63=02C(so4{~F^Pum z%3vN73XAZAHixNT(B#rszpGMZmjyj$9j~JPBU{2eBVplyA7Y&)k zIV^jx8p|y}3~G>(R;?D+X4$=Y?%A_ALSAc15^C2E{`gt=_?U54hY+;->lXnk`rF;y}bKFqzFmCZLU=YuC7C5P_uiy!DZUk2@GfiZPg*oZdPMlt<+;OCi)|q6y2p zDhOJb&J@mIS3&f334N!8jae5_BR`Nwh*0fNTjPb~9)iFoC%l4LE{I`iwn&gf6^P_F zlfFv;{^z=(t{Qm+g1I=lq`}%AeuYXe%I~?C`h$EUoYFi60cU5io<_E&lAK-U?m;Q& z6i+`4i@^ldHVvDi>x_t`ytoYxbGQ74rRr3v2}X&RT}ShAu_ynghlVH90N@&daaF-* z-5ql;Y7DKMJxSAsvLmeF0R8sDd#Mzp(7Z_FaTPdS#W}^Hx0zwDVL}eFs`p&fDKt(o z04p!nz?ruThn-LJZ8WVjV4`f?=Yu5K%ifuNT4vRM{1RJOt5}{SFV{)nD5e6m%~CJt zTve{?9JcFt07x=^luw=v_{eW#=vQFcUlyYh%N8-0;(B%LD>>q?oz^Hx*KHe6(e)=BlV1Y&L6jWrj{~_djFah&a|=kDE1))<)L=&xK#nNq<=wGS4Rmu zf@Tf&cOGL-WeDUfPzPdoVKPO4O1SrP_h`K;2gBOxFpu{RLdA?%s`2h-NA_Z>h$B_U zt%!@Y$kO@wdtC4$n%QrH{r09S(MUc>(cdp8jUWfN&g_24PYCL90Jqmamrn?g)Eae| zBQr6O1K?q2*$;uJ{Ns0qCb$s__B;H_k(KOn>Y{&V00Ky`h|Nd6-q;0FTR6OHgq>9U z#mjIp+Ai$FGHMIj6Z>w{)JkC+T_|ADQ)|pytt;0{)B8i&Wy-#IwF~o`DhhX@u#ALT z6~Dd%9~CY?d7>1;XQ^>-M$FHZ#t_+3E>()>fY26fg8-QgrF8|a2V=v!!wqC)H{>;bByPv43KmW2zk!-rD_sQ zrB^ca|NbAX$$f?phVDJ@36o#6NMlLiEiTjCW&a+JxkWGjHsSl{$n|SBN2vt$5-JQT zP7igTD+`3hfY8F{eFo3Xd1VRsX;dLvum2YBS)njF`yLRe?KQ$EzhoBi2lDo4^)0sk zI!r+1EKP2$00M9Sv+v3@YGuL^Ikm+pnN~vV@3H+b;=zD(=E<_T=8AiG>Bh>CQ!At_ zCb#hWc>R-(z_W&K|yf}m=0rr#&Ip^)V2|6%*wh$ zYPbA(qxm={Ip=$TkXko^x3gXLY>cakc`u>#qqf(PsFQGm&(ntvTPL zTe>|cI@+q2hEVV0ux?Jm7=1Ml4VP>XskfwmzXhVmE?=ZsO0#vLF6^oQvsiGj|EtW+ zCP}N)ZK;5|q_tUYdGpU&A3z3aFju4jAsQHyiAScMH!pbE5}9BYF}wt?w6kdc@fQ@H z{jWo?dl}lW!Yu>{u5pwX$lM2RR_xO5%o4!~@QAC_W58H}Co3uz=0lH%iCqKjgC9oh z`oY$Bi*JqbJ{shS$tT~SAJ+SoUc2Y_7t1V=g6{@DWFFwy3B2%;cej}PK3HBKu-<1p zcp%*D;+UkRVqDTL3q-pPo<M|99In2kePldO4U^e6pm8o&?)B}{zh3e&Y;@iY&{b0tUm4KwwjfqbyoD@% z+Q{GgBT50?^Fg#Aq+JV#045HxL$a@U40nq8feA6o4g=wVbnn6rQoHYF02%qSMMtb9 z_xw&VbB~Igvye@NwD{E3QEf7eQ zspcu;Tw8g%yaqTx&m~JQ7$5C(IRE-TY%J3K>NxIvtGpk?DVM|dvjm-=g3;4cb(tEG zLD@(%ebXq^!FZG_9B;mZ9D!U+a)t7+nGq;LFFdDS;yI6dvQN8LpLjplfFJKg~OyyS- z&$qL+$02af!AAsyUTB+3dJRNVK-2{=vX$E7{-8HMKiu~je8cnc(G zqyoHbR{{n;n6~%(Mh4wwPE8|V$bqQK^Hx%q9dE-f{&-4G5w}CsjS%r|EnV1tYdgm< zXo2nxm;}W*Ov7ME8S(xZa3E95xC5F{LO3Ur@|m;KThUvUeenHzYikbYhXT-FRIGkQ zOhu7HMn;bPk8^}uTXUA1o&qeS4G4=6)_ljuQ{F?EA@t8GE`L9U6fVkhpj9@_-E&)s zizWf1z3i{b72gMy^_lc&`=?TyBZNKh-^_CftpZ3JIYquG-CiL8R2zT+a0Z~cTDj8j zKC9$?c%MtH$oV1FpW_9ofBgqxQ2kZ(&pnV{scysD%T=h=7Lt=urYefo((bboefJZM z5VN?x$ytJKTDhtk{GOz; zRol0>xNv_9vL8-s%+NVTO>NbeB_e47S6u;90>ImIavP>uF#?`ViIj!8U;iKhZdtb5DAHde#=osTb6+IC$=lj;dim1&mLy zirW(OV$VG5eo26l&m_`0g2-EehyXDQhSupklYDNdjP}G8WSK6S+tpyGyI8IyvG3<-lvnw!=d6pw^S{re>&ye&; z$QgN&&r4uWpwPd^0`IX!?0z^EqqF@Lbsy6fp}=!wE-J=Z)>}~kxM+U={F^AGrvYfr z6`9^&C5jxpBDw&))K3pzGpxX@NfNk(y*lrY4$C^{JBKQ zY?}xX1T`$$R)0@i`u@#%+iz>mMg#z2W##9>NMV}GJqZHrvj7(?%duE7fbvTAF(Yn} zS_J;Pny$grnf9=j;?98~z8@?IN9diiZ~^@ib!*YKsB<1`2@W5xJ4pQKdA~eU9AQ}Y zlO^KjdwfR95ajkt67Jhm+rKQY+E9>PMupsGOm|-p5w_!qc!D&lzI4vvZT3A9% zVE zUfmC{VUNt8Vn@pNxbxeajW7ltTI9}i?k_y&(r`eK&Wrcfow5c`~VXA#wU^MR$xe@bITE?_AI3v z)e2Jy{>@Rc>MyOcg+fV?kR7K2yIe6of3JuJ2?i;KTCz2(X1B?SwAWD9$cY%dg95k)p=MnB7Kp9NyxGppBsJxf zGGwdxxzu_#kJ7*LY1JT~nWY>SdRr>1SHjc{N!i3I2fgMi^xe%2^?#e7w#vh;Lniii zKI#aZiQ&pNVBnE3d;dzty@(eD=Ny)-#fT?E zP(!a+`2m|cCnzu!vvLyUOTiG{O^&0BMrz(+cPCtkGNOf4a3t)UeG1De0JFDZ9`%Sm zz&(%7dl8X8;V(`=qk6n@tfD^m={BS-unwk|&pXtCo3#B!%$03WK}=1bzbK zmvg0O*lD)@GD#_4w7~W2L$7t5xg1Z1t=4!i@~JLBCLLxhnn*dJ9|_bl=q2zri%JcR zIz9Fn;%lVap=+H~&Y1kH9gy7T7=shK(xI3~LPk#ohA*{tTS!Aat&NZ_Gm5pmFt>h5_jz>rJi2I_=w>Rp2ju^wbuhRJ#1rCED?n zvG{!N{((xRp?V%#!?#vfp8pFw=lKQ&{tr_7d9unTpV>bbso3=*w6VKf>c3^Tc*Si0 zlQI;5$N21AnEU|Pmj!}SDfy?SCZN@qulY07N{hk!=faA|1W@ZO*TjgBgT;@V``3R~ zoZ2%%O*;zQxgs>xS8p>4EQXem&l^#1hnA-N<8~VWl&xg|mqCaZ0_e)m=Eh9v@55|o zR1W0YpjWh327CsDC6j|{zw8wg#>$DY&p?*x^R0^O=e6s5u;iY%X!^0~$UyxVuA1q} zY6WY0Ybm|QDTY_Vu!0q%d6FxkU-X7y?ejcHQ$Utx?5ixkUKC_%D06bOp`ld&V8G^Mg;`<4p_eijZ0|KKG2<3QPpTrl=wB3G_=^ z3wo_>z4A$Y_eo94ZMtl-Zsg-_BN5_GCHhJc5>{>-+QMyX3OTn%La^t1m>{XX`@Kxr zj7&}`qZ(xkdJq2zmLj%n0q6c$gzuk6_b~~qT>p?-2JD)awJ8VZcNW@xsp5@72&=6r z?LVwc@Hj?-c8^SSt}Kg&F1tnv;A0CW6C!h1FF=r72nbRLsDb{g|At#DVa|_J%vqOz zZ1>~*l82xbq6ouU#=llw#!{YBxK-b#X)4$Qrv|^YpHSOV_wVmaudpZazy5RPC!t>^ zO{^GjtClu;YX}A8-w|PG+WQPT5iiR?Z6edPjUZr9?svO(b%yin3zOpYL4<*7%0b;S zd&(>HWdXS7QeoZ7&zhIwh{@j|NpJdX2tz0e%*nyH^Sq70Sxm#+=QpbM6NcGX>wbp1 zCsCrU`#1o~aM6LapBLR%X1t#l=lZypE|fFB^5rvj9gQwE4;<7Q5~i1QaZ)vghy(%v zSCma-lbp!LSz^z(NUy|g>!J|KiORt~QwwfSo}gbXDgVQP({#yUm_6qOxjLS{M8JP0 zi8ak_-eX>grdh+X3+`75dKKXndfhenaV+wjVEMT*6)_yJCr4vEBfM$K09ZHY0v+eP z`o&7Fo2z@wgiJV<=UWy1-ckiUa13L{RKGfIE|Msa#@Y#|Y*Q9DIT zH_xD7R0uZ>Oj=7C{4s2r9}RQQ?%%ja4H)ARr5>EBf?>Yk=?sD3YkU}7nM4}nNw zAc%n-m5f&k#kSshpJN!d;wPD4538c2ZF=Ycj|BR-n>J<8LtIamydsX z78hAPFp2}5Jjx7|oEDUvf(0HVY`(o}J~x~<=%~^R?|M+%B^Tv5H>6#obP~-nFr&8k zP^(>Gyk!!#HFd)j(9(Qas8<*<(O@d|+61nq-aC>~XzYU>`Pb*E)OEOggM%zV=D(xA zx^Hs8x+8$2cJlm`un8rU)=t61J4(jXjGGcYAl;~WaAO5IX82&ayHqC*Z&itn?#2GW zT^y|Er>yGW8tGG%mL8G`8g7_Q0GNn$me6B*G#(fA(EsMK=%>VXc;_ z3dMR-n@k-FQt$agrnSxdTs(tDwo}d&CcJ-{in$BkChvrXw3*4p1@R7-CoFdbxHeh+ z@WHILq;@wY)?DN1k%5HOvb9E1ZOKekrBa2m+;PRHf?b79=1cM#zewzeg&&^7;tITE zfkkED+-DIC|GP*k8(VZ0fOu^% z7SJv~U&y5e?06&Zzg0jVIK@1pEp-^3)BD1ND;Wl+b1uG8m9D*0=z}LH(d7CqRfOA z>UO4TOIHgSAwlRD|JMe(wRlo5v7~-QiB@_3emcuK)f{n4aGKU24Mjv#5La=iUQb;E#9uH&3wJ98vCDHadtuw`R^9y#&kTavMrU#aD9S zs-mS+qaaY>wt+H$`c}>>CXKE2xkbI_f?{9BBBo^PZ0yPj*5*k<;00yQ$BMpAC!%yzM;!lj3b;%l-BiJ)c{XMlEU0`FExL z05$1bF>2e;_NJD|licSfly2#580}S*h~5IzlhzvKo(V{B>wdn4kkN`7CaIh^Menf` zaJ345zRx_Dia``(KWpvt`m8_Srfcg`B<@eP$l2}b`LGHr<~L$MHY(@U6XAm=jf>}UR&Og^soxqzlclVSl;P|o6 znm(D{n$P`5W}**oW!UaSgFX74xw1q5eiqRuQ&{1C026b*C!A+=W*V%&^Omq@6#%iK z$co!W&NLlExRPQw?qgU>ltshG1i`nRem`F-(hx-XEi0cFgjNm+_%m~|2;pK_PIw>P z3fUKsuB)b8!4jzrM8{uS*uD`e$Sp;e%pUh;@8SW&+z!m8Tv3eUdl0T5WPfcoVM5jQ z*}4byR?G-W^soQnu{CE|xRizpBE%`5+jUvPdk|TdK#rmJ*@MNXx9+cZOPK;pR3f=Q zN^t`v1;k`6yyyOH8ZU8dvuJKkR!k2;JdFl`psq$ph8;~WBM!?MD~gPzLWC`TT$(Oz9njb0H{lqt;O#vA<8;O@^5dpT!3}{2x&u)X8GUPC2LL;PB8#ZHN@#~f( z89S8v(owY7*swLmE9V425cPcw z1PQImqR0^t_?N4aUi#pL=pJ$_EABJknQUtnBtJXkU^9Pf(LLfV{JYhV>4nw15CRYZ z>EEMK0F40sl6LhL*A#2S_48M#WwHmT_ZXPZs^7Y4Fv~ob@EOJGtF4s1pUKjEs&9qeG|t)4}ape zbkFh^9u#9MU}}1yA&xb_m#U}eexFGW!eun25uIlM)@J(`8+y-0g&XO%u=u=j0lyq7 zH^121&=sYK3u*|3T!qRU40@jzVMsz3l~fXxMp?2lt)U4!kdlrhDj&U?IKi0t`|)C2tn_#3^Qu>7C<|3=|k(+YxgvPeFc zg4r+7!dp`T6bO11tu$!EtWkR5l5L`3?zxi7)2>+2R20-N;}V)V<`Q54MOhLSqcs(< zua>a%)mfS6MLYg|Hl2IkhZ#7>H6VHfdP~bT;$N92&5We1_V3Hy2uK*h710)8?1@Wu z@1_EP(A&we7DBXeaRh3FMUngL)%ogYTkk$e3S6c(flx8}dw!?E)5?}$+M=6Zah?&m z(EiGY74r=DnSGWq@p)^@b>*6W*aIrd+JGjMzWsKzsVQw=Fg3@&Hn&R7r0kj=so!}x zpvoBe`94_pmFz~KWc|IGUO)6S#QBi~g$wx-VoRgoZ8)$~~deR{F1`txUfsm@<=tl$So6 zb3u1FjX;~qDjF1kUzIhtg-P@FZxPQj?%tT?PHsbsek*Mnu>ggk*yK-fIPO+G1m5bQs9l^FG5UrhJy% zJwX7tQY$2t-_1KKvj*p)=2-&rZz^3`{AKk%0dMVDaOEaP`&=-@`BvN*pi=`T%zoa_ zAhl)xT5b$KYtX%6tpH-{9>g-y16B+Oj#lnv_E{>{$hFT9(1;p7pJ!>|s*v=^sAc*p z1T46hW??C0YxI06a^Yv)H>F`M0M#VtZ8=-%Tl$u`vX+SAV@jP87>cZ# z)bpv`=RG#LLO{~a^Rlr$X+*JBE4E|37YT8sIM9^hf`39BYkzrPK`Fy*B(@UeV0zQpz7;^g><71V{yHxBcO<#TfujKb z(4Sg4iW~Fs51hl4x4iGSlSp5oV>&p?NZ$h>;hsUX_^eOfR=pJQLl8E>@K+8MEjPt& zL_<17Oj?HC|H^+-z(TR2?IBjA3zx*t5H1cOhwe>T{Bgwlx4P^&=w(RPR}_A8JGEeW zYufgo|M{;|w~^^EdSBlZ%P^wwdww{pmk3R#T^K-h%QVXtHl1@}2=(_jZ7#WZRvW^- zI2kG_*s{Dx8$d z6fOnOX}8z165qdKlYD>RvK+}?>}2f~tK*wvFsUYj z+`AbM)3{bqQwYy&f65!G506@N2AhDiK`enTrd^RV}-^+;||)LV%x-59cWqui|iZOLw6NI8`Da;y`*N^i!Sz`F7eq z8JMjyoV!Iz!@q*jT1=0~`?D*&A{nqiu?;16>TImdde;c6Uv7Xq$>u?&_?`2BFj*sd z@ipr|2}&T4Al^VK5jpe|v)Jxkx;HJ;m?a^neUG<7FUb%cE`nic^JCtgC-oSW?!dN4 zA5?)WQfDtzhjl73AQ41G%EXFq;ftyC{F{gRuJ|oiOlzWf0v&+2%C%PI#Yjt!^S8Hl zDyetvQcEkGfoJ~krwbzAeEIu$xBzmm{8rw=<7 zt6Egda2kCVWGa)_NMsaCDfWrDZ>d)`ePF>56Vz6zI()0D&{`G|ZF;bKBCK}I5}jSJ zhs#VgvsD#xGHSfheR{wr6&*FSEnkLnj*5Rqz^&BlSD}H=NU6Id|N5Bx`pXTjer)F} z%i}2kZqy64cTEGkw`9UO>&gVk*)rS!zXGbdl^9GlME9 zriv*HSlF)oLszaUl%MO`=rS-ue5MfA$e#mL*k9@Ghi+~+972Q!hor#AOMF(|YMC(= zCLs#n-{TZV3+5sx5>gYQF!jlEN8dba{F8BMM`0^m)@Ot5pg7`QpuRSmzhsWH+Yf1a zc@MKgaC_e^bAR}JqNlbh$9q@N&0UC5F}=-INapmSYRey<$Hu7>bXM?o-O;8!_q~w$ z^sdVy>kX^wq1HIwMM_JrVgcZJX_$=4;okL%tGd!*9D4`A7zKv{N-;7soQn;~?%X#; zIAp9lKM6ew6H=)Wl%S+yPg_Bu6x6@|r!qB&YJSj0aV*lh}v&x>-9hC;l5 z?|EA@Xa3v&gITEtMr{wY+m>K%1GPcGZ5Luf-a{Lut(}g+kjnHcuxQ4tr60UXFNa5T?-*Z>TEv0vW#@B#a1@Q{6BoBG?N)RKtm_3 zp{#gz3`rHO7rzx5NTiEDAXSeJG(P@3*4`{6ko+$9@$MLmvjJy>a<2_gbthrL!Y&d$ z#y(-bw42bHeE=5`Dn1l9s^AZ#?tKV?;G#8A8HXVI5l`e#T&VTa3PMukRpkaY7IXod zH2(Rj+4Ukp2qG0n3Y0BUFv#Xp>7=YQVZ9p!rye9R#`^i916)nYLMU= zaF=$i!jPA54k|~N*+G@f7D(*!8$sD^0WB9n;C^+9E*ChSlIys?W+r3T6F7dRfRJ-I z=!mOZP^kR8ZJiCOil_MzuL-XCv-#aB9kO=K3SzKm#54y;f-S;?7Zw=5Mz%h5ba%v{ zHUU0Mpt>eo@Nx!`{9UdsXs9h>LUw5Ijxr1ZPz~CQJy&GB&BiC%?dj+DYB02KH==zCXtV?F< z*B&sL-}xWb_Qx7Iyg|Z?Uj#m67oZ<>C_KZ3Ep)%h0y^+44qwK{`W-Rnz69ptr(*3G z&GCqf4d=&V41=A*iBYK0u48-ThvIi{)lhJ`cErW88*INrX{o#dNtHavQ}my(7ba9F z#Z$v%tj~+c?$HK#X@kS1*50bDn`Gdnn4n!~xKfC-Y=qcF|>iO?f@oF3A!fTaTZ`O<6snGf$eqFC#>aVHiKRX_d;?;<O9`H6EoX_U;C6e`$jZ3|EGJ zjY*Joo!&q+B&Idt#X5(nBo#04OguA?stGvR0`PUPlgb%-=_eu!}O_*5Y{#9 zB>5`Pw-HkJqz2Ao*d@pYdVQMC1H^pJ5@#=tdu#HW`1>nMSbV$IMC_5K>G|&9%5V-U{QJCuH9y^ zCwCbs7Xd%GPIM}$Q3X0xsa%P@tW}xugR;bsvo>K)*?tZARIPBXCP@p(2P5Ai5)nD( zVDzTyxlyHlEN!)qB0#HEKVI_%*g*91T?8-SDy)cI=$nmHC47kAy4e8bftnc(j0a_ge!PfT zh(hx{#3kttwOLadCl)N7)@IQsR}6RjxAm|8xM!zB1~$r6_V2YMZ;vPk*k=Q?wqe3r zQdgS3pWI{I5={2{7z_19P!qUt0v5L!kqyX-SP4VplJG8knD!D9UZ&(vGOZ5lMuaZoa+lAx@`k3jvnP%nd9 zggZnO5~Q>2E2NBHU`VLz)*8iC#a4Y<&^@_i@mpZTxR7DZS!xuLL#6+oD;ICoMy3Xm z5-kY07X7l8fw(=L+x-=|jTJzRB{ECvA6R|GT^84WuV7UYKetwJJ28aL?@MpPa?z8= zKSGgbSLcS2RaXe#6N^3PqXy?EaEqF{WG^-7dtR-|s{)drfF1BBr{WCq*jESc_m@=` zLbEUhlz(ZVpJxSZ!vK;txDEUH)`~Ulgn`w0pP_0|7YOll&TAMZdrWL0(YAYzmD_`q z>-HHe<^Sq`ezS-9hiHc`{0;-IfPJ>zGbqv929Tc*sx{CHuxteYa6hgjL}UQHaYMV6 z^a9j-FjBCDM!1gxz9+ngNqH*7c4P(aZ2yJ`9fq~PIM>y0-9jK4=T9)9C|fB9wWVuR zYI;MYmf@b%M38BmORVs(|C#q#34|#sSTcPfnS39w`1`=zO54_=xOBVHZ^`p>D>Bc0 z626r({`p(#ruYkLW;$hX74|?}M;gjG|3DqM662Ud&|>;BvcjpjwRJld0_^AhqgApn z>7}PwUmB7g}a7`hgVF|s=HgVfD+Pv*r@;*zGb|)oUw9Uqx_p@5k z3Tn?wN9mDS(}t#uL}doQFiYL?K6Cz9Q%X`plHoAT#Ls~F4F|4`3d6dPyr2*H{W+T} zC@EkOreGRBUm5c;HuFSTQCJdzS+I+GKQ~3(%K4tr9#FlcKr308(1a z#|Y%wfBpB4(^?p{;w`3g5692muSX{6ptlS_OME|DsX-wm zW)59|QMNE*;ht|h4NL)aVdoH2FqB^lh(e^Tta`(x-1h{IK>to;tePz7_rO8|v`xUD z3z>C`UKz;=5C!pP`rU_EZKd|Cd%3WLZxgz`I9wS)n_CD5l`>M{Jc|&pdm&-l2rMER z6&5{ppMlmaa2W`xjj$|l8cc9DeRF)3;{EewMM?b~5J@_O0dWCWlG=g^*?XXk)zL`F zw8q6?{#Fa1zb98(F||Dc@~b4`&Bsm|F5K3A7G-NI^B{xzhpgN6(>(YrVkX5=y}d)3Q{IYnK8R)7$?rCR`ze@IP9_0B&|+hVCb zOF$ez{~)^GC-YWu_rxIy*E|zV8DLM^GX5EIHhSp_5N^X9f!Jaal32l1peu#Z%6-*S zfu`qearxYubLnz=BPx;G%T?KxpG(%?lT|J2{ft~;Qa_n`L@lSmjc#*0spWiDD+6Z; z$UrSUIkz|SSA%jLNqzkLvju=Tf%$z>WQtGVmenVLUj|ye}5{2O1vkO zQ(S}S)kL+RiPB|+TgfjKD1^wfBLD<%bn>$`ruq*DHggb-pXs5=kg3ZQ!}V{BZMMok zh9T}A_<6xC0SnZW97HZ+_q>O4FelIjYLT>f>$h$L{=EBO8CMw$BIonUo7VDI09>hs zG|vkFu+l%0^ETnagkgTWIS_Ke;<7*O%6#9dOQ%5avbeR<&iOa+{v&8edJP%|EDZQ+7FMy1UT6zZFRHMt zSBnGAVk{QUE-_^wO^U2j#SB$qPBuq~;b>Eu&XE}{DJ;_AD+rZS9-jf^?;lN|HsrZd_Tyfm_zya1}})v)K* zHku}W^OBoi;;u<%>)Cb@SKdKs_-CVU!c@0i?^JQ^xuo(N!vyLt=#Oh()PgL~3CWx8|Ky#Sgxx0*NJRm6aors7V<9@4>h9tUKoE!Ga&bnB(Y+ z-HP8Ik-I;G$Loz)SrM5$%kNN8odmLTWJ>QcbZ zLDc?cljZbRy%vIsuv-i%FBJl^2{JpK;`eI}r~$~NSN^S7Ls+qv5FPd{*zN3tA^qG4 zjKzTW^J|jZwEvOremYIa)h~5$UPMzoyU;?zlP#lHbLR7{?OQ+jk-giCDUkyL3qQ2# zi=Ltr!N2_z#);=lF}fYm7@@z_-(zxF8;U>Z!3dXBU4Q_|D^q)TlV=ydq7jAxIjHTp zI4-0)aACDapDMcmi1)k&><{~xH78jC0sAtQU?hOFp(q5y{1b7p$^i_?47b9{GNfzp z5A2drCRWkPWR7n~HY*_)KU~}cM&Q;+D=q_2=Cs~?-g6tj)mAky&iiaYmkC6X;d(y< z5dVLoQCLMbabEma{!_U114U#4P3pOUltC{w1Xv>{pL;HAQ(CSpL)ZfK_C51cy9(aJ z{UKlIN@W8-Kt)6=h|Bj$1Ufx}D`q6TwfrYSWD#K{Z`WIeK@Z9x5eq)IJssWhGi7qk zvI6Oe*>nC)ukMTu|#7(s{3iI72P*QRm@i1xeA1YO83 z5G?M%8dmL%*(o~VM!bfO6`L7YafwmEh%)O1ko1`U+(_&d;mEVW9CgFE(HxG{`Y`t5 z(c)*oQc8xE**EG^^1Zk?2(Q)B0N)k|Ojzb}TRsWlY zi0VR@3FZ0Y4m5?zf|>%ba-jDKu%-v2%n~+peGeEI3N>%lVt=C)XB!y=ff*7gHaNsQ zBf%+bc@r0vxmqP;5$AVB^jue&xi)TBrF2cfinA5rqhl@{SJ}j^MixZ?ryZ-^;O??+ z4)L%5xRvD@3uObSwOnO7HIEIh49LXEhWO{AJwsY$%e6AN=uL007H63VW_*C)Qhm$d zH0xI6Xxl583;|`zigfF@JwR_S1>I+B85$|%Qd&qW3{e)={AXdw+0d+SnFIxXn%ezL zoAWcYXzCWQDumnIt;mGWOXbkp4D zg2Qt~uDD1WihC?T;PxgJrm&Uh?ZB$kpXf;2HdtPt0(`VJoiCX>CmEw!R&%fZC#6`m*YZ$kzSNDrPOqH2ip*#zI@E zw;EF%v|ab}_MHF4e-0sNLW@qeDjR482PdTYmQOe*+tY}*=2-?n;XV_vG@)xqXvuBB zjs2Mqh-&B%&2Mwt^)?uSgSdlVWRP3R${p)B46LZG!uWWE!Ywdx-{%UZtKI@QXKD(V z1}s`(a^|=0YgdVWt3;Iem-~XjALUrqWf_$p*D5*?P*-(xwlI}b zu)zqh2-CGB0B+7gb0*JS7S2TwEeWIMZ#7$GL`i}B8=7Db6PjmZ{5Ss3!U~%@!z4Kg z4e81}qwSUHw9Ue%Gb$)S_@3maM_ba0jzTkvZb2K(!78=;SVmuvTMTlms2k+yTZ#!s z{j!|@%m3v5Zh#Q!J@Xlvx?1&@I4|G7t$*lHf=H=g1Ux&~V`)TS^J{`XdVDTZfypJ| zitCTef4aa0~ ziqxGVtO!^cgg!%WEDY$6UCvm$ybZ(sK2#`9a$m%Mt+8T*=olj|&KeXoYke0`jx>cE z=7KNizFf_Jj;>V%OqY$GH-ZXyP{exo7D{wE*~-@;|1MDuvfSO~%zA~Y5C|0PZ$a6I z!qYY(o?cWy#426S9f2KamkixV5Z&9mB^1jU+y!~5e*`B($+^u%iqng*T@aC4=P)8d z4f&!PAkn~;th2~mjB_L)KeGu#kf8>b8n1b-eX8!ZPcnIIpyY zGE|bL{5ia-#a#1LVNS{{LAD-{6f4EA1CkE$*4;q=`k#jPv%sDpqBGqa1k&&aXOLjw$f%{A8%WRbf_yL)ECh$DkvbSQ-$?TYmlkQ$Vc0R2=5>)jWPb zuoy!crp!5)!NElRJ<}nA-$CJ$&@PVmn0*^pSfX6Dkw{}@g5~=eU^_Hxfg1F!S}l~JARsP-bL41qhO$8| zST718NSWE3_X#hR!U{hDtKE~&U70yAl<=&k69a{Q`)$D5N@Rv}7Ob1c|IPnLUj%ep zql8#eOcF)Bp%jKNlrrIDzZ=%uc12kEXQI;Vx3?XJCK%|J@Nf8y>ugggFX|h&BwSt!jei$R)j_+sL9?HG>1g?`3{P!oeqmcw*E1~Jn_Ik>PGOE zjJl4iirACec)-hP7Pib84p_9lMC!`$K_&_<7apv;cJ0HVU3E&_K+!IkuUPL>-~&S_ zT=#eGjTsj6S#saRCkd|U0uPFz+O=t^n|V3p+L<)=`B@$t$2)6YC6ykt`V3_z1DijY z*%<%7Zr@^iT5P|%PZjWI?nr-6E8uUw6t}mCxI?U5<(I`Xzn9!Qmsq{Nn&Vi&-33ZS zURl8NpM8P5-6552gA?pl@S)nZ*Rv3_ISXE` zYU@aE$Y+B!2X8QM|M{sUU)+~+llq20v*=etLXw{I1JPV^Nm4mS5ha3>7RKOg*`-fy zJ-knqgyzsrjz0rGf?+}wrm$Ifpa5K+D&+>1IVmJx%n6YDECT^<9C6NL{IB-o1*R70 zT&B*?OGO}Ve>k=hE{hkO1kX(-T4E9@1x^cLrO?(Rs470jC&<51Xkja1Z*lZjc}aqN z!5yw77PEo3pNt`@R{*eouSVGk%L|#`_C?&(r_$2oFlKA99QG1fnoSB%sOC-JWVTMT zh?VbQIuoXL&SJDxd;w6n3eR2&kyB;>FZam>vmK8fcv+8k;bV6G43QKHOK7s+FYGyo z-?dKpw{jwrG2S?6hi{QQJd!QyVWc$50jiYel-6+OR@)^WjuTS^JWX8Kx7+ z5PaZY;4CqS32%rkApFy*Cd)H##9eA^pj;Pua+_-GN%XrH?`oL!KdgI)Wb*x$^SYK^Co6(kD|o@ zWgY?;rUR~Tsm|4bGfs_85KU3J#ll`I8EjS9^OK~mi1q!d!jHkM3@taNqum4d`6{kZXKhxuSpKWIfY)IYnmmM;3Z^Js~MzhDyP z&HB%r&8Uit^e85WZDnpU@?f^XK&JG%dy==>#r760qM)mp(QE%m!1Vm8mMS|9NlN}?5!=W{S)4=%0d0R-u2U$o^gUg z_A}%>X1GMS@9tVT^+jh1CCU``^BzsX9LQ2eTfjB}Q`4BnhEa&0!j4+jQU1v$|&k$rf<)Iy^$ z+J^9!(bM)>gJ6#=#GqA)NlXXRa2X8bNMT|I&*HM(13Z_%$Oo~=S<0H)8lfj38CULc zPhA-;-v=D?T=M*}M>YFs8EW^~KmtXic|{VY%=_o`#3ci{53+)SfT^vix3#rr8LWhQ z8v(d&*cHZ~EbDuQ;Lr5h&->3}H`A#N^VlJ`H6i@HL~fA*p_t1&Z)Yj+c53?mLwGYp z8`GooHWojLu%}l&3AX@_82|EiG00Vrx+4Dd9~aE+=8y@|HUQneolyOj)0-3=G(&Qd z{;Hic_dfkO7R>~jGUd;8sG#r`1Y7gBbOgha30b@{DpBoyHu#xz9dd&MvtO6d$pU4% z9ZPzj?TBwG4>-UH_;*EaVhqzp&M&~_)-Z&>bZ}gMf0RZVu+3*>ydLYI@ zdW2qAFNP^PGSK@=O}Y_Vy{Sb_ddPYYmu8rMGB*&S`Q=9n(5;^5BUxJZ-L^?$pDS35 z9K~)h*!~%o$Fa_<{)!wGzr}6nCS}U#VMJQ}QC;A0hzPcj#lyc3t_Xs!ipejtESbO9 zjIckFid7VDD9|bR*#z?hp1;vb-w&49<^b|%4c+dyG|_#2b^;KlJ%wc>m0OH!S=+M* z+ro>APjHW+aLa@V%t^f!)KpSa*po$J^TXRgB@k#@*~ikC=-+Ih5SnkH@Kz&R3COK~ z<`;Tabk(_)r!R8Oyp-_m$0sF zjOuo%U7GAy;`XSPLmYFEPK2#Z^%woG|3D^bZl*y5CznXy0{W#!=eC3F0SJT9LfPZ< z65+4fWgbo$I4_=jKZCQ&e}=c*FJ+d4^W`zz$@>X*pP+Q%1iNQ1Hh3NC`Evyfwq$|dL7F!$!^&`Matryk|Q>IsKM)&!)2I=~et(YKwaF zjHbCcqgBz1$Q_t{`<^V|60)L3%zSLPetMK$2>r8`WKdVX5r)OtC@$lt3!og;idmEi zFy%yh-UpK&<)`QNmOLGBCBuFHR+%2A@jmWbTUJJ8tyXXcw;4w{m#Ae8L*zG5F;wpJ zn}?2QG_G8_X;{RZV5B9swq0J;Mnd{U$~6rCQSj=9oq4EN+@j~e2$<)24A##W01?ID zR3-PYUWNz`zs~~mz{dV9&R2S?j0y~<$ZCq)l^aAR1b@LDzQlQP`aHF6XYGXfFZ9qIiE&wvkHGsQh zWsfGjX8j`QBe*pYvMWD==9?^_T-nJS%=q^z19=aal-Q{Db~SASpmfd5gwPcRPqa>^ z+y7whHk=14ASmSpibWim+y|7Bd{hXzt6O%Su!nryDdaaRyZb&h(UWzp*tpkJrxvsN z%brLlwc7lOdDy+^Q}7ANvRv)X`HQZ4eM*e3vHvYA?l( zUWibG;VDriAb<1ZN2^@2yj#goD0+iETaV-Cbz1jq>zH^^T4<{2p%8e1y1)eBLIs~5 z3mIIfO@M_SWlLPr#x|L(gA4(b($7Iod>Qz4YL{|RBN9PzZkXB(G1YRq>#jw8ik>rI zk_({k;FeLl%npFiJC4FIQV{IA2HHL4`d?cMKhd?Wc2Qge&ias>(q z7Z6eAzHfL>n$)ro(+XFZAiB^P7Upvier!Da1FBdTRY6Z7c^U0<3-^bZ8uhiKM?YjY ztA-N9oXqSpuap;S{2KWw%KXg#9iuneB40lA4w9|k`&orA;gZrhi$#2+bn2(6$arg! zmlJIk{}#pt2b36HfUmP14KXv3#aqkNIOTsgAx@`|smSvMJ0P9y4c^eN%J<$VHTke% z*Qp{`MoG21C%O6M(>G$)hnw;poSSV{9mJb4vs^Yj>yUq4gRBErk?U%u*JB9UZ1aV&Z9-aEy#nzmdgD`=~0)QFn ziM9e2K@rYYsI+mNn1ZPOu}d{3*FjDdbTn1S%;Jr2Ossp&6 zY$zYUJGb0JQ+16H#S*Ss$5j?h;E!{Jb)-3VzMCT~AjR?hHVv?&Ej+)P6?M3w-*QUX zVU-tqd3NglDc0y9M0vo03D2140N`7o?`QT30rLYy2}C`vc5%9AR?gApat9`hv_;zl zsn2kok)6YV^>*?AbUocY2gvpk3wlvgd>7PRQ@;EaRY5~{8>_GPPY>?(({sk`nl-R} z{6Xqe_6&>rNZ<~8=Mw}Ci?lUh*V{#ON-t&9?2j^}zcsCA3#j5GN{GiKd0tXq|BRab z*Z;WUv%kP9fC#j1DXE*1DI>&IW&AEU@TP9rIsIr9thn5f}0Cnhy16Iu}a z#7MzJLPRvgx6`ZdCm5FD-@VLOVLlVq)a^2-jmU20vQ6-w7^(Eaayz`e4Qx>taym=V zNh4+dOb4S~n(fd@bS4%3^9puM>Yo?XPJyYj z@e*AHV5AAjFh&9C;VKrk$&Kigi4?Tp6=}+;fOeR&wxXD-5#ri^{m0cv3l7c{U>dac z@hTl7D$^B{Q{brl+v=nh!df6eW6B|K)yjmTSVIyCeCE98ma^gVLMa$IQ`?K>Sp85axV10i&FVv-h4@ zv!+*d{#CVWDs^SnTkeoBEGyE>o@2oH>8(p@Q;xJDl#tMvoj@aE$-+>sODGxvk-K?g4whg?YWU6 zETIs~Fnz^93t>s5Mga9TFQk}6deHZ0G9&)=P2c+GN&Wv{|Nqzj|MmZyumAerySV6A zq$~NK|2Lpu(gXrXqv=>Za62?`hB81?WVVDFo`iinaD&T_r z$BiP!wG}P^BA^0mRYOis2l$O6-I_{%Dti@AKo+%sbF_6W)qAX!%?Q!6+`!}|}$k}gaq3IwiEfQI3m zzP~!yS8iGx#`<6VFZA*(x9Q}0EA>8aDMn8j?3YGqim&9~v?2p}ZpA_vm)_FItSf~9 zI*vBt{lrtLHtH+5w_-;rXn8+3>iJdqXlhTED-?Ef;=(k7uZuyzKg` z8mK2G&ln0xCCRdqXen?(IsdXLz^{S1ze?@1R>(N41Tawkw+8AJblao_7!^ymy=NrX zO{WLhM%iM$HA-I;hbHHk{ICD!XEN&f%{!>C>NmT*8KxFyt(;%TfUjIWM&IWmv}7yV zAEOCQE~NUl#mqsH_7;Bo$J@fnh3}t#PVUx=hm)NAqcEvkLNx7~iZ*C2vnK%eS3`;W znFv6I`wQR?3oo8&jz)xVqaPFN(Oa~TEr!y>hhNs)sp%S*mRiNVZVp&iP#B90XfASq zNn_6y-9f#DjWXxAvutMxC|fd+&%I^&!1!t53I?BzLs5&u?F}JXhD%wvKyZ3HtMtYZ z1b$V0X$2J;a7A_*tUa50pQ&#HVV+m&yCn8xWW|zvtB3IK&uax(nTL!17W~-T_4f3w z?YeH=zKnVqARub@;bpie&e>2L%8UhrA~|siKW?L zrdLd$=wBp+3$QjR#rVzAj%B7M)JU$m5b1V@SNgNya4^ZFc%BhGZ%zyA; zg@k1C`JB{cxZMLf8u=JM0QB4u$tCurZW^4@C5^_0Js?d@0nxGskQMm3@bme|;dHpQ z1bTYT+ncim=fD2XOS5-Bj+U!EaJ2Pvk8Kdp7Hj;wB6Mz@7uo_40O?R)K^xXqXc{SD z`ZmOM^b38l_NPHw2~w%upPww(bj+@EWKd#h|f9aSB!h+oGaDAz{D$dyM7Yg{F^b&wQ~03AX{Qe~?+} zZ)YAq7njfa48Y&z1<#w$1tD;^*N?%`K7J3glIhxBK?dHB14CY>6zgY?Gw`HY%& z=<_+~SBZE@ps@|FJ1(Pu+O3fH!6r;atBuS`n8FpKD~TIfQ?gU3l~coiS@(oa0s4L~ zS+k{NK36o)_xW}RSAct#p<-7VwVE`RbS+W-;gZt$z;*K^nPUynew!%xN9f3luIE#K zBEBE1*M$Tt8qz`rvnNbyYFQ2mY`?0=NNv)%ADa=a?EFn#RSlib-?K`6o`4R7MEw?i z?}9MG?VxuF$$`(-+mV5E;vNz)YlQmio}%H88P}B+LdyWop}mh&$To~Uyr8_#2+Y~( z|7=Koh24A;;^d+1r=e%0hznA??%FH55JU_Z2e2sTf}dkQ z?C(Sn7F;Dsi5WK^l4k`biiMh8P(1R7wk!+WsLz8|N` zEMtwe3a;~Yzbo9W!TFW%q5Izv^@1n@*tu&Q4C?=zM#IwHULP>&$G|eVbyIiKtUN#S zM=c_mNgeT{Ai#o#QJBU9jM%51Ep8x$VsSssMA_iU2pxcA;c$t~MumciecqB2)xFeu zP5jkW2(%abv3yz;5ZsrcvJyE;EyQ7AN6$`~$Acx3$1uMEM(9wNV`aYwLoB{*t)Z)! zx<^^dknytxc!sd#?Edbl`5M1NDa{ydhI}kO+82miXB(YSnYaa^}J{cTlVRc@0Dg(Del5Q%&)%hNeu`ngDRqqzD-*wzJ# zOYV7}0r%Mmtl9QY(;{CUg9v&AXw#nqpWzvkMt+r+Wrm!OZ>1Im!!SW=bww%$h#x!! z+>4~P^x`B-pd6jABE8>PB8v0y0#>=d&_9TSgq-v3K%tgyVCt67lz*K`1?`Dp=;y*Y zW!60xl^|fj{xwFHLGlu0c_YpD_N!Or0)J_Jtjh|?%MO7stXDLDQ58ZLAb;QdoWH+U zoDRaE&9~pCh&Aii;Y7=Wvx>I2hlDA-D*sdkZ-Y_fVb4}AQ}-kw7-e%_otK(t#PDs% z^!ei%q+G$CmAvJkz~uIdE(?SOPH>iBwq)8JMB9eL4mjK9vx#+!g?n6a1<3EgEYm(F zqGeAiNa3vr;Sv(BCJ!%28Fl;n7$BqDJ$~}7l!xLzFyU3l|FI!A(6Ak9_n~$Xp5g-7 zU1$TGE2zi2?V(W>OvRwESsovpm4FKsT~-2F6w197gqt6(iGwONiBh}H?BNGKFIXmQ z=RkG)C)TyAh{i_A3r6xwMM@RFD6NU#=ouT*OG(<^{U?i`u7xy8vk07c*&#_kTaM_G zDblL*xA-N@N#Yfp1Ll}6F*v2bs{u&Z+OFlYV3x`SH7zMuox+ z6m9p5LhOqe*SseKPyV1ppBw~gObp(=0kkA+7ucoF?CGEGiQvrTPxX)~HEuCRsme@V zV~q2Jg9GJkT5eo?#bkE}3`?L}u7*R?%M)3CF8U2lqqaN<8&fW?6nODcSw_dRqol>N zdd%QdWlhleyKm>5>cQ@{m&50W5AtaWF&4YRtvRVU$z}np|a@x5U_p7 zNJN{9(_1>fg|RSpEh1rMow~W4>`E6O!YcGcRIYZ#Tjh3Y3C|u(lDM7?&P&6e&^B(J zVOm!Bt-6iwy=MT*e@7qr)APogpAH zBXl|5;unPAy@jASmWOZ&!Bdk!D`}NzvLtq&H|E;4Y597qPzDdFyQBw~nRZPsg%A%e zopOy<=9=Bic8B{({q!mak9X_il?^MqF4L(HGvplc36W3t4mvTgVLKfvvI8H5KmwqS z(yC>|m;2UiEIN6<*~UdO!MZ!B&1pmldstk`#+PL)Mi7JtWkFKhJfsg@S-WK0=4_sz_p`4gFL!tn_+HIGxKze$-dwq>;BNthU4Bc%LNp2YU zNeks*sPBf(z6E7M2WHFIw%1sKP_nvb#b+KC^ad~wXHsdPyncm{_U9d#c#Sm`VNuDl z&AHiAs*Cp93xoQ&^Bdu4*Q68?2yi>abdLKF+4QX&QoKG~kFm+}L6{>Z3{^x2$j$6Y z9UO-1j~!b6&!sIC<2WfHBY0JoOCKQ@A!|F%Q!a;IE{i;~-qzNYFVT4NklQ7C?b4~| z($N}d-Degu0w;B1o{&c_sUeZYRwanht$VXhN<^5}V4ZWSIx#^V0ZTOlt@G(YgNHx+ zG(O<&ziC(#PF*ksz=cJjA~H_mT}&ROi~1DLy;BW=i@Qt&H&5E)U(E)50CufmL=G-5 zbl%&~wjKC`pVGN@1U8%>BT3xwhTt^gU|#?w+-XA-7sX_o2r|UD@fpTP^(pNi>*gF4 zg|dg)SwMhfDb=W=W**{w)KC^eVsJP%!4AH!35rcc8NSCxjlJntZOVsUfj(zoe(~#YFXa z%Y$^0=noNL)@JI{W(t`TF=ve)VPx{x)Dh^51R5?1DViT|i&PU0W;K!zBjp(U1FuBp zp=X9+a5_AVq*awOSZ)(sDQNP24!z*nJRA%ms+9Z|X{F+pz{B&YxekW^>u&&jE5U#@|ye=};4ULg!y%=BONvYntbBmQcE zJ$M%t{JM`2ZtuI@9CgjQ>{90Oc5kJ3c)V%5i-TYVvCNiz#*FqdFl-3vt$8T$>Dnb) z9F)hZ<|sl+NblWVGE_l`u{;X0EDPi%lY2TIzBEpiI z1w!>~L?)+MOE3f}{)*_aYQ%E=_o=kIv$$Qx0396OmEkNM*Q%d7c?&DM5FChwOtcDs zCQJ>ngT+_~cr}|G?RE}4$vA1^+->J{L55Wv&3~4W!hlR2T=vZU;%-RVXz94mU3zwfmbzesaIaA?g=+M!$svEbt( zGeIvay)WyVY}6ZkLY~AD?KIh;NFx0#L2nVIJ^REOsGx|T%W1y;0URA)l&(XTCw|4& z#3Sy(jweoFFH#iPiHK75f>S1W^AQ*YZ9Xx2uY$Mv(}d8ut03#%;c4ZC>jBv{OcH&- zjWUy$@-AM30QXKfUb+Vjf*l~K$VxiWu4#zdr78Q%@0#wp&I0D?Vt7mGOP4|Xv^ z49Mt^4jeqE-gu-E1}ABhFga^%r#2$BIh^gF-ilO7IF~;?&!m(I*u}ipj0-+cv>bv* zSdLh&C-B@*8N z&R!!>mwxr=Rp_W8=7vH7rQ+HWh|~JXyl)4u4rrI2%h<6&tdl|!ik!-D55=Mzq%sEW z&7agQd803dkZ&n9D8Hva|Mfp<4z?ET<(w-7KG|Ov*P1oOS_rllDWqfJswPUF?0IXd zhwiN6^P*T6Zr03WP0&sxL!tHRhe3O(U##O<>Jp}(Nzty!vs<;PERL5ExA}$WuKkr7 z6oP$LVXWnEJs7_r3aJDznCInlEvIlU1NTbsKSyWK)crl7QDLN*$FJKRF83KimfK}B z;$Xz&%I zX#hT0a`$hDRghk^fqoI9riRCsxs^QDWlSpjOwV@jv_%+j!Iz(V7ObC9`12JU2;f!4 z;?$yI)QAi<#Z78E36tJR45r^sn$vkt=D}_JJc1#8cxe*nrA02IKob#I(Tu29XMs&I zFoo%wnqNjlbo(|Ve-HFlr~i5i_-Swm|hMSNVPx%}uNRgmL#jgOkGq}b} zASDM#M8_h!zAHV<4hF0}jMtjLse`#iC{>-kR2KSs`KXp7e5P&d?e)7fcF&d~#rUey zVM@^2EO>ZM2CfLn#HHmLBtyHdk|tN@Ap71Q_-Iay;d@ZMx-3vq*;S6!I{*(^5f!I| zkh7h2naNhV_-i3wKq9IT$hb*l(@%Y&dRX=nw;8k=pWK&|+@q7%hbn!z=z0@SXPGUW zaD`XgZ?PqaRo7HaOPqHAkuYhh9y?{?zST>@{jpZa^-ZAQ?b!86F$b_h=Y z4lGOpLFX4p5ste?zMVYDfngiLj#G8##40|Zp)DxHsukcsVpl;iEr?T;Vq!Cb8iX(3 z3+yicLYxzPH7$bl4G=Ffla&XE?pUSx0KV_4Fe3KzWXw|%dE~#n-j&`;(mg@2Ym9SX@pl9x>{zlyM zU;oWeg;VCX<;Y|h>wZRzpD%?7oZjO4jNlp^{P?!4sfd{#6&x`+ub)>8={yEw)cYAz zDZ}_V9r$(N@a?|6&->iaebd}y7|G+Sv}^Y`zqkQ$t_lA>_}k!gq%kA>&Mc7;idcU? z&$?lPZWF78ihtEhY)*(FdB)tbS&LL2 z+{Buz8i+OA;yvWjG?UV80Fl$##^+;33@0KlP7omxgLRuAw&wKJGL_qUkIT?)`RC*G zlBC0mgBC8$P^kg7K-|*kc>$(IDF;%<>O|mzA%K0}BJ9uK$+p>cvY-F@AJ4c@V^LZg z=as-)({inDL0ajrSf*xr4luoKVVD#Mi{a{??NuNn8k|iJYvfe=_I$o)B*Y{Vi(9dv zRminatH1*4eWr7%n&5K0vCSXu#xz3DsYPfa%CAR0rj|Wr!U||W zTOo}IS?PD^9f;&U4#i>3voI84{9KgJXp0%v+l+e6e{xSE_^gZ?!VmyxwW0=y_%C0r zfYUe6Jmd7D2jDY6g0k}rK`r1^&dabCaQ!RIprOQQql`@HKJS@wxlAn*)le(zXL*3N zqrP2>>@)2-BTk1sK*Cjl7sq^CCYxT3bnsD3YcgAJ3EHrO2W!Ewzz#YJtgSP zF=*(0@XIFi=iiyba+H8{5liUBCrgfyLLAYN5}*4!@J!op=|>qDR7M(*G?)s09CD|3 zJJB?gUH}a7eQ&)dA&p)aXmDB{;#{7qK=4q0tO0%tQ#TDQfl6dYKCfyi2UIMk_%iw} zw^X#rnH)HBEx^<&;F6Zlh|Hk3l6pII8F1S;D@(WUx#-;nvPDvsO}GN4UvzG-qW-qa zdJEQ1(Wzm-^{bjSdEQQfBC$Y%69<)W|G~unV0jwwL(QODOK#KLm0hyT4x%JH{Y?TKN zkAm>J2#|FY@Hpm&U#dV>Rk_w4eBaj~Xzl9$luX*t5Xh{vW2Li8sDuXm^qwzJuVAAa&=Fx5NWgZ+R%C4BXRI=lE~ z;}t053ViCdQCmvhuyI*Vy3G3i>^nC=s(#*OiWjYk8tp|4;OqL;P97&UsU>87uip=? zkTjfyz>4va=IDi$c>kKm_tnp>IlFzTtRU~Zn`JnELSjLDx3HOOR05V8;i8ValG}R* z-tYngK1zYs&|7*1j|mr|RNIGxy|IQ-MKMqxk($}b2!VetDX1rfm+S9!fxVX0{f3vs z5~uz?biWou_)_}KwH0p=XGhiium6b-{_Zs3vf^5|`TP~j`+VEv?df6hN8ku+(IEg# zD!uc!@HX06vpApGip=PqGx=8=sl}fBTzrDuGsM zy^v`Lnjhw<)sz{ewgb1_hA9KCC`3SABJ>Hun8o)jqTn}kM$v}p?X}N>C}SF&*42GX z=v4~)Ob#!xwe7mkSA?ANimX|bS-G9sJv8mc3LrSBEXyg^{iozG<<$IkHeFek>EcH7 zF>cG*k9|D#gHJyZuJz)1L0H>$JL89!=bn|ZVyFovj;NF#3m_m~9$PaE#iZc8(AlV1Nj6F1s?ynaJTY+G!%7xvbPi2?M&Q1?14R zX@k0F$T_$3v!c+R!&?k~$WE|(#cXYab^yfvJ0mRl{NZC>0SFM{T&<=o#7Y$Txdk(f zl3wx_OoS2a174mxAKJ2iYs%+VNSC~2X#4>eN~B4IY!M)|j?jFQ{+)J5)a~!zu6!2s zKlxi)n#?{}P45$|mS^QGe=wx5LQvVqiOcCF>TS5g|b}z`7#7{B*mE5FrtWe#Uc?gjz=&8nF0Q49{QH49ZbcVAJ&tBT48gUH2*$|y9 ze@!dMGJMy+c1^3e`r8H2JL2GaptwCL-yjo86fCH$G%>vWit5Tu!qJKoovWTNGggsA zZ2FaMM9Y<{dlTQ{Hnl0*2&>>MdFD{HtV^uJ9Lo(JpbyH+UMHw;fzq& z`O&AtssIBQ^o215F~3L3BMIx{^M;u0tcwc^Vj}C3NJCYPeTA1cFeXA=ECITgydj9$ znyEP37qE8gDFP--qC*%Fz9O~opL#spBw0}Wi&2GgQQaR@>H@?MS#iBrAq$N5H}_6^ z@tr^9MvwyVTU1ngO+&7e9eW&HGcZ3a5O&g;Z@T`nAzZ4rtqZX?;-eF-xTaggA-KI- zqazF^LrXw3P(@i|3}A$9W_w+Wo|2XLlPQZvcmXu8b|)@p`U@sYi$z5We>@EPU;iUg zTERWrqdhXww-Qi4AAe3v^yZ$PJwpNr0aSif1|nB;tdI+6%5DI=1?YB!wqfc$tP3qu zM3N|FU7m#ut|+gPlquT1t6%>{>qW-}BV`~3V)ke`&!*jXq^-!k)9Dcd81);lkO&c6 zCS?P^|EcZSX67-CADiD)ep|ySFHQwlyGZbf*D z1e~G~)P^eSTlMWSM{=Uf0~5rs=$f7yKcH{%H~OLBh_Cu9F+K^fY;ZtCG8_R$VR}D* z)ILis%*r-BC~!k?e-Vt-bg+_E_pJLq11JoMTW!m|=!K?+L>N3@*3b0qpq4N|YgMKr zLk8N>vZgiyx3=g^mqx1*iIpof2)KR@azxiEdb^Tkgi)IzIhsbwqy5|c-~R8g%koxM zx_?e&u!ab?09fmV2g%%E)ngyb*l|!e>|rCqpE}km(4~Kpx?U6 z=6=c*0MI#qMAz#I8Gr}^ye;xm^g`Q7IivrV(>4<>j#79M`et1O;0b?E$-+xykzdaY zlh%60@Y!}KUVLf_VEK0OO$OiuawgWTp^~xlvfFIRe6sL@1t!MR8GhVH=f0oh%0r2V0;f z+7PI4xx|RS#mkxgE3vZMAHuX>yNPUl1UJ)x4KR}=5K!B)MpZWO*EiOi)TZL2@)*M3oAjJn?`@_R=EYDoh985{}JPiWW1d zHYhc7z)q)B5I%{LP6vz*w`i{(-I(ppc3uCCx?lD5Ma=A@@ zK&GpGwijCvqLf1vL1u{}Vx95%^v8 zLoB8F3GRM>jV84*+c1iBrBGL{`sI-oqJsYfKdG)N1C`e8X0SPv^6s#5_r1pyDsh`( z5mhyc!#{Kb+@TiX{a;u16bO^>NkB#=nD7ZAE1LO*E=(DkpR10{6Aeo%G-lXz%HnS8 zULRep$c0f9%-fAiLV@cV!A!1rD=?B8;mFVq+`Q&YakmFA!decPpQ0VS%7OBS12vb6 z2%?r*;1B3u3^x^j(@|BDvQ~DpEQAp=NnFA%#{Lv|%~$JPB2?g;J*#|h_c06JlQ4XY z{Q6hH?QIanz3>?b2#&<@Bx}SruVrE{w~CdY@Kr2sp#G_D^27>6g+lBhR6wu_bY@|g zLAKCQK#6^>K$T7i*Y7W0vYyYb&5%`{fJl2S24b+NtAV*gZ=Ay9N5O-$$;}LJc?2_^ zO0J$=Rjpa5{bmEg!DK5m@1qkmdh{WRLD*NR;_;lIA1(&>NvnP`Q#-YJyBzE7(Bzz2 z?8#408(8xepZ!wJkOVhJc6mYGC6E)A*2ht*!M_pzDoGmV;R(C~r9`h%sgyZ2l%|6s zM|8m=Cm!;hm?4qG%l-vFwzgyJ*VN8*_OOXR*XIO&LvWFVh+E2CH=Lucc=Lqy&oiCn zx8QzZt#Wio<+v4orJDbQjMLiPif@tWIcUOF%;_Et)fA0Yb-lvRhg!0NXDj1__k0NQ zKnbwo*z(g!%ra)Ap9l;G8ibM>1#r>xB5JDMXoQXPE0n=6~<%=U{6 z%L`77s2AH857t3zyQb-3RTyAZ5h*u7AHu&h>stS> z;bvCD6J(Z;Gt8_`=P>lW&;j--We7|Nuy3|hIlnO%)r!z*_)fD3x!eu(zKkkRetuAB zN=iPw&=v?z_yGJX>#~>1G=@F3lnDm7 zPim(~1L~dPTLoLZCxJ8qn31@cj}cD8 zM_U*frM+q-hX`}_ni@uIf; z{TitGnK3;k{TU{GQGr?9T3l(y~tt6Tv@IHl2e_PYbxES}QciCD5K|%ky zW4D2m0ZgVfd6|@h?(iqhIU9tUsDrt=@gg%l+Yl%}eB*Ijmi?FcO?cyPe%sE%eaDlPkI^1kjLOW&hYXNcjbfq3ma8 zQe=sx$p-wsq#>1;+|Lf=bBEb<6~JwlO%CLGpPw=^CX?F9#l2;YKo$hoyOjAmuME(2 z7)K#_$)qJ#q1w5zS>`W@j3P8(kcZOo%H>@;UVMtccLn-Td>2U4h+q(g$KlIoN1H|I zoYn&-BI)-HVV3WbVFo<7+41$KF#6$F)D71LPz8-E+Y0af%4Kx1?f;WFsS0*`-O%;A z$&q(Ir4P8E7x=XVg!)uapIC;M&GgrV-H3OFyC`oq{p-60WMcgtq8IR>J>5>`HyG?j zHnK>Uv4(>BO=XZFY?PXVk0q!L7qQv>3ub|ROSj0jJ);y^4mv;HwYNA0wzGo}e4_1U z@xrP5K{#May^+?GbV==qB(WpQ5en7WoXp>1A53vYO=zWzk8F_moU8oTe~SR~dm z4op;iW$lx!YI-SX4M!}~ocesugxnsXzuy57aM(t$Izz(;(IUSXlUa5vq2uPJ!(=)L zEk@8wNH$5hrc5y0UU7(s#Nw>WzZ10jr#Wzrky;YVm3|E@_4eXi^;OLZQE0gq_fMA! z$nUA#&(86dMo7&`0&QQ`FDN5@TayRV2J+i7r#XEi`Hbdlgfp04B2m>BRAo4A5`ehW zD=>=QhO*_9)=2m(vW+RlP-4T=HU%tg7^?VsYN2+O#^J-h_LTdphp&N2>} z96AgT{3JfUWSO`PW;;wJ24!mcR!u5fA>cD`2k)~5?LNG98#O_Hx?H9KA#T3aW6EXM zKX$tNT%=`df%^U#pqaFT8Y1PgE%9wo1948g5Y5|42fXfWQ~Z12J&3YT==;14lQd*{ zkFHW#bi~WJiH*4VysG)>#jCmn2ayjPB|wUrUM+?1VMg3C3Rx+I5&A-0L?)ddMO*r0 zKBHUHCYdemR%)eD7$$Jd5#sx-08TxFB}PB~ezI)4I9teZERE7vkzOhhv*oJ69MsBO zGLVfoh6!-I+ z{y7W)6)I#8Zlep{bHbNV>z2c(-Y26dO`Hf5C{2Q9wv+czS$3<&9uY;<;ol0GZ~F;6 z+bA9tmQjyc6;`yB%~ggLxuVvYU4$^i}Ixzb+OORuJe}aD!Zzw8#1|iW&PO)Bx7A*c^Kzqj!07y`r{@Am*lvJO$JY88*AP zi|cv#0KyX_NT_A;D8`&yv*2vGBJ{O-us|m!J-HoUqvxQ;l@OU~=h>U1+j*vRG`HKT)5s({9|zdI`)VOKdS(ywRN!cruw13@7@0>0*rmMDsgFyAJh|0b<*FI(s_M4usvgXvEqJNOxyXQN*dmvrD0{v(Ezy5D;*YI|TFd&_+)`!xMC~kg^W;P$Z zYGXmU5N>*l>EX7+AkiMn42|D{EcEbFu+RfVmsRh!0?4G{dY{NL_^XvZOC)7w^tqhJ zZ=#K>j0gX7D}Hd1(dpO*k0n5b`#`%M~vSqTjib|K(Dp79eCLpN{r3-!i1NQm__ z4Fi6jF4y!1$AMLYb>m!N&(aXBSf<{t_iTASw_@qCwuwuS+}gTT0~D?l)CQ|^-QuE` z_H$cr6$f>ZLiXu5>c0Z&`?zo`VfHa{%%RT^oPyn2Izq<~en);ptKSXSJ;AnCfpdn@ zvPYW})8+FrZQgdt&Kcdq3!%b913|T`ST(3&o}q!pRNx8_S+JW<2smJ#-5yy?xyI+W zX$5Y(aDjtZ*@B-<@lxwvx)b&$ut>oOAhWra>(5ZUQwl|`#lmPAAPlBxg3O8l;O`(<{UArg9Rv?svZ7+L; zeZF{X5P^Gt0X)L0Y#B^=-daQgwu#$|T5=1akd@P1*|i|2O_aA3Bq;9cY(hp3X#f?c z-vWASh0wts|W{T%Fh2%z{oNS;TmKG`FN>?&AV)jC}%)4~&BmgmO#}|L62Cv4z z?wGqte^YNmnQRFRN87Y?>#nzrW-X z`$uQXfDd=VHw8ajv~gkg{|bXK{`b>^K@EfG2fsrJ<<*4tbRMSiTX)RKL^L(Dw7DIEzEhRod!t|H{P}3m5(RLgtDNSvS zV8RW{6G6UAcY{Y-eS` zjX55nHYaw0Q>I4b){sA!5?qoA7X>C^h_sU+ZP9%da`5BkD2K?;@Gvkr)CP*>(k;uZ zP5M6MTZ_ikJuX0RFWmY&Zjd$v?LI@U7(Z819F9=tEw@1qY8wHS}TlnYG z5`BNxA_Z&oPu-CjlsG9G*a}oHmB5{)^wo3QH4({mPyAUKx}|N$`FqQiq--fj=IGm@ zu&hHuUS%Z|RB!t4z&F4oEe-Q^P>U`(WatfA)?)l7sJeM{6 z$;Q}c;c_hpCo4dMNTwC8lG6Ju=b1>~Cx~?043~nYuv+HWG9YL}E`Sb)V0!=SzgfT! zNG*FYPZq~f2%p!_G_EQprd+kAE8P0YTYom8Bq%Kxl?ull#SpPFh_Xfj7-^^K#WB5) zWeCnbw`qiQD4PPHQR8JT-{-gBT{N+!N!q zLj#i!Ebg?qB;`~MEc;~9D|c3b&;5)my5|+7Vcmv-P`W(`OHo|c?fKYM9=WoFU}%b( zV~b9`&xrNriQ^`{mAHD`i?DPV)XD*geHMfq^oHw`Id+E*iB!#q7MGevw56{Cpm3{# zpAOlRgJ`0_jp02E2<&IJ1Q>l$fQ3!zit*3UY53Q4Faqi)uxB+*MBD4qx`!Io)|FW> z0IR)tZf_RUGutb203`{Zt#yT3`0&*);;`%yes+?}22vb>5lnth-CER^t`{>wP9b6-! z6Xra1SMh-!pDMa2sM4lUMUn==Zp9h*% z5G>I#({bLEcRriiw`&&yTEFk`xHm(gkGJe-W8d!AP>}(NaxTG}uD;@?$^}h5*(M67 zST1lRIS4T!hCJnqg-0(6FbecD{u3^WtpECNB%E5(*yrcTfSGv+E<3AGo?5A{VtlTp z%;O=BwfY&)iFp7INvxnLmr0;LbzULGTOj%iK>>92{&}T-Ygmqk<(`0AsHQy6QuKY| zc@PlR78i(q2W$a=8PKJTVxSy^DVG_PQQtmaQk#?saGTDV{ngupMGA2b$WbeH#li@O zUz}0E$UWQOXsWPi8x-G&D8t1;&o6i4eB;li1s(cNWXM|wpLg-Xrf+0 z+zPUFzLHbFy%!8j5tnisF6*{cD~Fh>`wPnnPU$KL2q-Cs#aF@<7j8S>KQQOpBw(&m zfmW`+gTmXOD^?me(uhmcp!U3#pVxSs?pS*B%ta41I8$%=2ootp69 zBT}3FY;96tA}o-D&40pi#i}Y}0k5${y+p*?5f^cRt%Y&$693|XRm-htfRAPIy+$8$ z%jGBZ@eaSIas=M0P_!ScTYwjC{K97CCAbqo z#TG4uRUeJmCG|_-S3>g^afvbwZrg{Gt`Z{Cof|jE8fgu}(%sI^G`75!!r7{Y(xF*_ zSV?{U<(tgXwgjn!nROYj4S?nNyctS3jVnzW;O*`@s5=O-qa#VE%Aag7hs9q-+ykEs z+aRH%Dm{kD40f1bs7&weADRU;oib$}v+LQTlHthB)k@?cJIqQ8{z9BIMU%1QMTnVPuoiWjO)j{7_p5#3-GWb;Tncf3dNo|Qs-v6c=# z088SJSJx#O_5)m!w>q_?n(`~MisT2od~?Z;>Z~Rfx6X{r!g%esR7Jg1ErY7{;jWj$ z&ig$^W)H13pm*#kU%egoDwMXnei|W|D=&XM*d);&COIoEdzvss^#^AXiK)Un;KE)Z zqV83SRGOy*33eB(pgRSg#ZC1=VmeCzY!g4*oB4S}Tz*Oq_|v&i+}!BU5kA41&L&0c z`43%kqD)$H_9N&P5CH!lqW<1mmY`P_#NxY)!Uz;Xpg@fwQ-TMC3AyON6fKzc;B60H zdobm}qz8lvkSK=C?Mb8rymUmb^CA~b>=edZFiXx(AFuez1I z3wyH^i1kV0G5|FKzXYfJv(cy=AAcF*T#=Q1ULGKq;?ccxRdaK&)$prYvDOHhe?sUo z#9!OHkdYk~5oh%UeRo&tP+c6ItX6JtxbvgpC>Q#mxMNq&iqZR}W4wdUrI?esIbk-6 zbj^zn^>8+t3o7bwQhdi97<@4LM8!KorxWZXn>X}ko$A^Eg>U5KB9bk{Ga@)X>bg;Y zdIR{_s& zJ79f3=kI3^{#EbV!nuAsx^ppo3+iZAB+=D0@$)uxs9Fmv%NtY$ z_)v4n?|=TM6>`u0N2b`dfRMTk+5*GKj8@P-pF_h+t();bW;*H7O2dRqKCg%^>*sQl zvqu;jtYI0T1~Tvaf~@-^m^Mut$h1Nx;H~yn?A}5UDTZaG=xM;UO8Ar4_x&r-2J3#- z=hpl8QUtm!hXE9G?BQo<0*TrxpEs^uBpFLdD z^SloyEQhYuGGYP?bNJ)vj{}n-FumuVx6n&tpFw1$aEsXj7-*?cz0V-$O#>11eU`2f zh~e|%S)Lp5eT?*^Fhp$#KemZVGzH$nc?Psk6#gAXo3_DPSqO7Kdor?sRgH8ZI(6H9 zA({l4JxO5bC(Ho%lU2P{@k(@rsk(@u>HhY})Q<`z;Z1{C2@Cy(sxq?epv`C*BK1CA z(oZlTK-Vx$gF#3u!YyDcvtX$Mug1rcHJXtnvTUd81>`QcQ&;OQo_5@#(pR{%FT+dV0*cuCGo|X`&{h z-tyt|@C$DE9aN;S0xbdS09gi6YKt;~k)m)H3B0?3JM(`vLw`mdh09z?D0dAb2J!`B z2R}(}{*h~EgN*KK2eDH=%DxM2K*gT(ifAK2Im1r9^&n}uK&D?Bu zAyZ6=mEhK_)Bvm@jw5ghM+5e83szl8WAxg z>aPiESQkI0hZ|L{AOHx&3=a$!288xpRxm{{07FMWxX&U}Bpvt4L^_xg{$}ak3Y~p6 z!I|=z{FX=6a{0>#L&AVPpyKpB{Gv#t7O1gI)=NCUnhK{B{?f+`uWBk4YPkm^)SKd< zfqU-hO@7Lz_iRW80H7t6QE}Jb_j9A<{2hbe2d-6_wDBAKh74=aLm%))eH+*?JtwPh z%M4t!8^hA_N{i-3bg!fb{lg4~T!MmD^c7BMuoY zia*zy={B>XD_eBR&?qQdBlOB?a~k1)^G|0Y6Dht0pG9r7tkrF8jcP&?vS|X8K@57t zZIkYyumQ=1WW9R6Jrbuk&;G<<(7*9L8uZ5hqEeu>=@2G@SONqu3*;4P*W$S}nX(l^ zBYA}Yq3V{=(-*ZAz$`Q$#2}{jWMzeA%D7A+=gT}-(iK1eSE=&{VcTg#ue1V{p8+#{ zfo3*+YhCOi^_8q1EmN}ZCyQ0P_O};&9s|-wfO)P~+8S5!kL4bK!tL`ii60FJL@O#U-Ryvim*w`8lvXP{9Av@3<@m?T+N!_!fi`%Suzy3fmQ9uOQKBq)M>3 zOzDlSd*Mp(?cacfScMh&`x^Z@`ZTO%hvH8Kt~Z-N1jq^S=kRE``G#`+ijk`{SFFjV z11UDN?d`LnvP-;6S@TTG_WRa6U?pk(KAiEB@nq0IH`9n+P7c4~B!P9iD-9RtMZSs&7@&XSfV%dT=S47ha zP^{SzCkg&~sTu;M18*`|W#@rIoMq zUW-8WHYb{vp{gQtLNlk#j1jI5NDBthHYQVr;8q0dN4slkQC(2Gr7i!^phmwd%|D$d z2Lp z{Bh4xJzN!o;APD&bCbiXtzxz{!YNs7zsQ;giOuGgyi9#~p3WR3d4l6G%Jq-AX*N&Q zO+^Ut{^x&e)oMO^jW+hIHG0;mpn(e+MD5_YrOSdgwf|y}$+Ne9!}Eg0009xbpKl{{ zanrxCe?F@@D~EjnmMtO3v_&xYZxQ$y`4zR4k>*qk5rI)ZL)pU&0I=3gte*`5;VqX- zZv`@tu-{|7?TTxe#**aybx*}Vi+0H0B^gRsOJmVoY!lE<9o}+%&w!x$tAU8C7Dy*R z0-rgV96AV7)2n|iNQ)Yi2*qS%_JTn2yb?4uZF3t;s5$VD%jBYNYj9rwMMLNalU8Cdm_sUw`~MP2RZ993tOg^Tbyz#5jX^C#lYjb zbKLK_Ef`w&s77#z$h9>|lYGyz7W$cqx3DDUz``_xOq~ggvsU^Z)8$A>w7>I`^-OxJ44M1>32&$`f9Hg=`|ex?P46(A%<0jWja{fqv$<8Z`&mG_CDXnY)MvJOq6-nZI=aN%IV)L!L@WB zjI!)Se+f#4fi@uPEjMbnuxFT*k@|&Z!U(@0$adE-9@7%lq6Wy#0ebL-c3^i03j(H%uY&YShc55o(EKDw;Edi$6xq$l2Z$ zvON_t`|}N~&#y!sM_WBh_*axliy#E$x<%YxIjVSnSRORo_04&oM7Pm5`~euydS38; z7Nh+1W- z3Up`N{DFZj(`S+6EpEcPe>;7J_X(eTYemxhZNbhK-{!Uhv19kOFESRge&Xs_i=djZ>UImr1{wv3M=|-$_kgJ>aPRYXWR>ld^IhL z+gU0xDzR3zz2KCqW{&~&TLIz7(6UVqA*`}0Q|1(xDU2*l?SFMctwO-t-eP^tQj9m- ze;Lc1Q?5^P=9p=g<#Z}<2X01Hv@UJ0!E;$JbK|w!b>OqR3m@^^TZ^g%Mpg85Q3Z$Z z&WHJ3K^{a_=czLVCPe}%CSn+-)BS6wt1&odAQX>&t!@l##)Sel_{+lhVbjM@txX@` zqd2#CCJoG-BBZ1}KQsb+D1V<`^}r{g`BoANEJ5Insi zAWp*a*PT5OV9mNI$>pqUiY+}c(a#970TO>|HxRV*g?Vk0mie-Mq_T zMh7ITs?6v7WlAqai40hmac^iMu2t>iO)^{tFTo<=Pcs(7;;EEY9ziRrNI|F1E!3&j zlp=Vb*ENHhl%tyvz>vs8YzDt|(SfTvDd0c<(?M%`EQ8XOwzN5WpE=*sw1@4ax3;%< z&vX_{dyaWK+|R{XZHzjTS`F(K24|j>@u_w?hBWQ(VGTiy(5|;K&rbCw9kUG$x8QQG zD~U|~>VE8JnBo|e7O;39r(g>uZ{$?K!q7yZxV2pHtTpHWOPSE1Lh-8e=7TStv$`IBE-vfk*06S7>{%(s&IosR8Jf>4l4!)p#OjEcNrh5(Y z%t8iT)}nKdQ3+63*HW?eV19H^s6pXJvaT#Qthlz2VT~W}b9+xAMQ}mY!1dD$ z3|;awScaOqher7=)uylV<0##6w8yYygzcr}?!C2WE(xbM^%lh2aBaCpL1mf?$WtMD zqKM%1K<;7L?7zm@+wSEO`ocr z)UDFD_VGrMu<6^T#V~kYnF-Lhv9-u^&+Ydyl77$IP`H)z3c()Y@0ePqoYDnG5=T9u z3-RF!n9d?{y*$A3hj1o@<^M97A-@1Bj!VW~o=v~>hmqM`eIB1Y&<>VcVn`QuBvux( zE+L&kTu_qLUmM2t{xLj?lQi)G1|~w(8xVcA4j14}=^|79q%gSr%}v2|G=_@<5P#ME zOeFq+PwM1NOW9kXNYdHTp-(IE7E-ugm0Zc`pp~MFyGP2kR4ysj1TILdfa?9RHB5|m z)*FdwPj8T7a7_7fuwyfY$AUUM5{niFdj|M}OE2MxDxdD2ZsU6W@nL~K6wdU9qfd%G z$VCDYsp4$-j{Rl6dv?kj%+iz!Y`qi;yNDVzY9J}ld0#bhHVJQKjROv0c{81AZ&jwr;0p6BFpRZi% zJ7S5;~t3`RDv3u#C$*Vqh}<$kTStr($2vPLIN!N z^QSXE-V*8hd0dBb;o=EHjx=yDSrc|vmty0RfpR~C8N7#Vh^j&c&7hF8NzDToV%c2?+&9T0#rcg|^9XL}p4;KgJ2}P_Jq= zB&XzF>ps&PxJgXf%8zjaP@}C|CTkVu$FIbldjzcDx@TPoXv+K9kSn?nX8rfY&jF%; zs#Wn79m=7fjJRox>z@Lk^AcJ@PDJ1y#ikY@>Fp^h3#D$GwScvU19h%y;By5Fk+6G) zT6SBZDnggp9OL}sb%M1 zU>2}w_WaHyaL>vb(Y3!{OHN@)ma#4?6{j|eLez?Z6(J)H3)a%jWq_2=D`KH={)@!D zQKJOOUx2_60lci`q6Mg4BkT{I%KG8vJoHbo5?e5`BJ$%=8~5w#sxsk)0zud^=HD2N zX>0_lG!_e+2g_5zYw#u<@{pK_ z6(H+pmc9G6Q*;6F&QdPawVsC`8ZM_+266r1c^7h6ZJVvJh+=$cm)<5Yyp9Nx$(D@)M6LdPFxZ9hYDnr z8bXR8T2ESy@OT?6WVLRtU%W527GI*wWUAqk(Tll6>hG_f-dz|E)V?(@b``{CEoiC9 z{>M?a^ucIpOhC@$C@YlDLp1A&+2*5%2I0&5FmUTr zSOo@Z``>#zPBoH+3Go$QW<9M@|6x^twS2~QOhXo{cICY3KOCB_YTRST*FLIK8zIDpQ0uHR0njUrW`{c_jysF^?TKp)NMmPgI@zWo7&Vr_t=2) zbWc8?2dHl^t$-1KTNrdBmn(q0ZO-RCxBg`&PgW+dPX;LLnW8z6k}U?xmg~wCz!3w^ z39azAPlmSv2H^hf)b>sP_BU;XN#ZBu_2-J)J|hmdjk>*Qny`zjAH{DjfZr2#5cfQ} zM9Qf>%Y^+tuRoysr;Xh{>{qBZ@V-w5i?_HAhHUsf(I#I4LR z_n3xd5i4PuE6$m|i`sU-B{PonYv$!*QuAyOL}_p6!@#=#`QP`9smm1lf@k|hDq%oa z6$j<8zMmn$#B9XO9@Dk7MLMbKG!sHumn~h|0RvJaGn^LKffXZ`7A&y&}udO z$!{z=0;Uw0ki}O8+-I6Ea)56Nn9A^9Ifb$Z2Kp<~2Y8<#(QUraNds_WB}4J zRkg?&6?#$X9~WWL)!LdWJpF-lBGwv%Rn~tiz}fKgy?j=sJpuiz%Ie-z_~|C^kokjh zVU9MK)AOP?H$?=cem8qwhP}rx$!`0xOhS%B(#RSJX@Z44w}b_(XPW0Yk@R$3;@k4A zyy%+!b(;xu6+D4D&!S=4D>$D5gNVS8E7xfpd&Fv8GKE5)J*ppei!jtcl_a{u}pCrHFe8u-iy4jahfb~ zCjgD%M@uiUj26t0b{CtT<%8su)uc+oJD6$+iTEbVXNp{sb(@&A$*A%{c1U5GrR2^Q zg-rxXN!RY4au-Son_}Co$yNhX{w;OiYLsa?IiHvO|Ni3`&`y=rgv}VkkI|3r$$&Vh zivOt!O`_i$UsV`}-FTYZ765JFX;kkLPbVorKa< zwvzHv#fxnc!Fy;zORe5#%c;0|)x%gawwH7P%Bakh%XNF7PuGw@w7(J?lS3;Qqw-TRXp z)@@H=)iA(O^t)<{Jv&8sJYxuLlEmqUyQvAK8Y3JZ8K=-)A z7d$tGEjP`#7}k}HRQ=U|{ts}X`;1rozDui}R9(4;(8!^bF z3TRoQGw>Eg4&M_6vCtP)91yfFkMC!<0p2Q1h4W6KKtT?e&_G$7;t(`-8`LPwxw4kv70=Q% zlygl)EB;h40R}J|`MpoUgp4U62=6aXMojRktN;HXHr-ujVe zK`mJufZSiQzq9xecUrE9@O;}@eC`=-rD7L7&w{lbecIA$&P?VcHQ= zX4>~)f7H*fcgb`ifr|teDFio%=A1BL!n!2~xPSMtw{gCmDFx{_y~=YTjQZO#{ap3S zHsA9z=guFpvYWTaq`Xx&=kpcoeWtu@jLKUu9U&B^KnMQ~tRaNpJ?mGBD`>$QQ5}S&(0@CGNu2zW^%gK8$+!_#LU_=PEiTGg z3!sC%@!_%DTq%5ymq63Gq#7prZ}D#qPe`I>oquZ}o|Az+M7*L)Nqkq#Tp{)?ni;}w zGtn~e*`Q#p(bsb+Y{~)$bGK6%i)Xa6(&2z;?RkydL@PAKZc!He&9MXhiB+xzuR{Qp zcAPO~th);77A`A?x`Noy-*Co)ueYga`NvL$Hv~GjkpLLh5AZG-(obAbKb;P#FBVVVjvn}K_`5^r40@?~cdPe7qOOj{en z$c1uV`TiD6)lCYrW(zuJa#yf0Ya%foW{UST3p;=+KkZt-PyVJL)t0e=k<@?w-$+*L zvu#)|r=s7^HR_6h7Q85;JSh_e3;PV*-qJEP3`lN&Klkwp|3sfDC2%k2aBHzpO(}%_ z+O?gn`*5Wf1`{Ev_!-ck97}1wbs;UB@B@>_44_mtW{0z8B{{^_s|dbevGZSZ^1u;n%O5#-N#5x?az#5!VSGx!M#1xW{DNiNO4Jw?f z#r;?t)Kb=b#^)^r5NdyI>PsfN2Q>utjJN`*5q%%*`4RaV#rydot)@#Cy2a-rfX|l| z{DhTT01?&3{LK4YvKG`*Q(Ljpm@{#az&5W8XsKJZ{;aj-N)PX|J_(03*%v;61%vp( zL0!1{vp8p-@rlnR0-EIX7N?9!A29ud>c(W}!7gqj<7%hH z>^4#PeHJYZ+Puoy3nP3XA_BY02-1+6PS;Fmp>BX$^p4uXmJMJ71gPa&^3s#*_0yBX zu?M9LS#Pq61cdBqWH-J1?K?;K7C*p~-}n1$=#jb0$^zxZ(?=7$j(2xSk4>nux6nZd zB4h(td+ILw)%wiE2Ry8QOkB-k4tk(&>hGof0sI#7F9C|`aD#SLm{n`LGOn=yK&rsn zJgWrX_;{YG=0O-5zBf~8#yq)ISrhKnsH;E|!1Y^KyA8eIY7m_bgqyr2xu7rW9xPOQ8b>EO}8@2Esr0_-@qKtL@eoa=G)VrQq zEGwLFU+>=`)p`EiepUuo8F&%pkRJm`<_(;FVfj69_;sr>7janAwTU_%t2&VGUvlb; zR-x7fs0>h~YBwK{O#$`0pm3+chUVavY(APQDfdtR2XVV(~g(7I><;ue=i--G>4?cA!^9ZQJ} zxQDd?_uPsb2?7RXoX?S7*4E~H53RRN(6z)i@O;VzqrkU+?v|z76}sR3`y-m5r)p>x zT$AR&1=MYrId9EDtQt9WsUOHIa(d2GIAmlhfCrD;)0o;`vXaQPNM)6NyRcS>KrqaX z(E||kH+yzE=s{nc#AOud)q*rN9rF4J_s~Dvb8&?~PKq#1xAP2K_aBd4w1S}DmS!5f#h*91%JXoi8HJardk8K=^z+>s zII8rxVeNS%T~S~HnFw9TR#48KCBdzc5>3_G60p=QI>+2|X?~`%eff{tb}s;c8|}$_ zz9;8TkyTk?r8QWa&K574rlbF2i-4G<52VXw`4XDASSu4*YI=|D5`hqlRpOa%;ms1E z+kA2L!e-jFu8AUvV3a@P?7pk6&sCj;wo~_c8gL!v9-Zg)6=tu(@qN_u^>y> z`7tgbL2Q@Amrza~oVGBi^L9BwL>E)s1Iz5KdsDVm-I|wR!LxaX!>#;fBw34xa+QE8 zvYsXreYtt?Z1g^?Dsv;8GxZ^rLExN#4UQmlW{9r$>E6Kj>{jX@@wi5XB^^Q|=H4x1D2@uAlfOI2Yzph;16WyrOctE|hX{%~^N2vfq zif$sfveI=B^R*gStod&|NIYV1d?lzE(WNZf{p%}VEq_d05&Nu&Q!QHHEc};SKS1m82H$D zX4=vA%3Ygja446drCbZ2UDWwIc467yuI(X!gs}gqNlZe%$TS~T|>(97rl4tvEeycYKU9O+5!1k3G4i!1lhUb+< z#6qG?)%Tgg@PwPbXXkUJg$s&5f3@5E{n4BwJ82}$mXh*61TSyDEesb8Z5Bjmgt3*Ux3$3fli>tpgCIgZ}* zD&_JXkZ%dw!zrhMW$%3)bXMR>G4YG@IV@)TeWtHiLnuKAeV_YG-}19E7nN;&&yoN_ zn-#&)d?vyz(EDfi!t4ySQC%p_sbze|4G?vSW7vJA=~2#C#8zRKVYy?ab{HY#oZ8`! z8Cllt?R%X5)_(~o1%5{@RBpHs44i&JaMN#)KhVM_zj0jVQV0G{2<))%NLxO%tP_Es ztsOGUE%>i#C*|O0>UKna6~v(T;fs-!{M1o@d?lT_8)AyW98_MBi1Po`fZS{9;apw2I%;2!<$IU33%fPOz@Y?eOthWPrx?C z)yfF(L{8rzwX8ZAM1S^qL{u+S4NIwAYnmk$5u>KEDfQV%MPooPw>UD_FVK4SH}qj# zNu3QtTQgV{RZyW>1pzR+!O}Qn1-qz2@mF*pbNa2S@ss6Oet>>dV;(r=wHoW}2Hz|8 z6d2;Vm2xKc{k_oHW`o3MG{X=DECdOTF=hGiF9eVxFb%&yptLtPZ9W0Gh?%pRzZxQO z7S?9qV~8_sPYLKUp~VlA%9_DlWTRU1nu%V%4t-FWkI~{m`iBS1_)%~{GE@(;+WmUN zT2%(c7spgK$f_?W3jcbQaQ)d{%wvRWZmNWN($GHE3n=lrJ!M+>HUEa;5IvOf=;UdG z_of=DauqIBwPQ>EgiNyd+I7FPS}`Yw8G&JTSrmwk>DN`QE{cZ#7%ciN1yIhS2Kdka zphwM>-7Z77>MhVTLn4HuTK(i$)SwsJTRlF5T82PfkyzGhwbWA9`@yfWPSae>(f9D4 z`)Kl0)@y{UpuTb?(FipJpWIJWH!DBHt%l0MGYaMJtddwg@x4s+{?8)1Yf2BLY@TUu}gz5L5v21`Quv z(v^F#c|!0l#-#}>x9V0%CUkS7{CW|gA%iw;atXATgde}{pU?yQzzV{ey62ijLz=UK zK_{IdPsXAE?(htiO!oqTwSP;aKYgy?kob`VtOb+OGV{b+A;_PS=?+|*)(vZoPS`u8 zTTG;v#RU8J9B9i?3bquQE4T(^4GDs`ZdK@M%c7j$c*VGb-$=vg`(UAS!$>2iQrYtr zggGm-mPsKnH1cEk$0vk+1^(`3&i06rYxh_oN95KLy1#cf|M|ZizsDQJF)kSOrE3iN)kKRu=KLR&$BaM0$;>9<1EEiP+o*~2CGU^#!+ z<^CQ5X5tQ5_c$SGAgO>;IlohLCeL46p^Q->!;?%NGwCU$vX#xT#jTB^8Z>VB%(Mc{ ziCsiiD^0)X(3TX9jF9B*CBu5DbZk`SaTFtDU7>Bj=Y4)OekAKEqf^DvjyCG3tCl?6Z~7)^7`fYl@JqF!{NNf2ivBYhl^JJthni_(wkj`Gubt zqp0DYRpfB{J@@;b0PjIGg(05hEl^v&<$??N!65A4TfA-jpZ_lwH;pUV1ztg*@|K&^ zYpC4c17s?y>(;I+!9s^`0k|#ikMgVgt5D`hTT?Lp>+^dt142mq+ZXf@-~=jc)=UaN zJ%qbA4}zOD@c>&uq`w4U!@I^}!zz6YUpiXmEM5wP7&$S1ikg4~0EFj!SBE?Q8&Iz7 zF7u&9-o~@>XplzUge^1BYiebkR8X|_zsQHhK)t5TS(;AufU zDUyc6Qw}b)&r`8>Z~;8zZZc`9yE_Uyxykc0D6I<8!bAlGsEA>GQVxw-1W=)Sr6*mu z9yJ~Mwk+$XtqM+dk{DhBz*KzlHQV5!oaaHOU|+jAE-ugekeZwraa^3ZlB}KQQxg1f zYMMAbsJfX-GNuWwD<-}uk-EZ^PDKn4?P^6;kBaJay_i+D+K?=O_+@jq+AkhL_SsS3 zH5V{Y6_ac%EPSM7ct{d(5xjG|!Ov7o=~Fd_w6Qo_H#eTbnyEQ*UmO~=;>+m{a>j@Y z^P^X-c-ghfb2NO&KeBbaS+A0)3_Xz!RzF)+9q4APOb1 z2qMLwQl>k2S=p_qwQPTd6)EnO*H!04sk*(Jy4n6q?R?WWq5^(o(y)JM`_|X_=`F+u zLMx7XmbFV43)+2(EGwAzS1=J!=EaQ&m4rqDsPFwS+AaQ?+rE*ODOu7EN)>t zfYr`@K3NQvvasdg6VZ1^=p5kAa&IGmlEdG%rCDh1$3hkfIX}aTnj&}CB)z!tr8Y|% zM;J_{XgrZ4<3-}S4F9z**!BEs-!19)5iv}iV+Lwht*G%n5w~D-Q)?)SJ8*u+%uD!m zI1CS0t~V8obb!~h$7J6M6YQq&{ah^@){^0~z)b=wiYVUW`3tIiUxsF#x@~B(luWdY zk*I|A8ES*#tch+{8`k`3UC;1wUhe?9jwqWB^x{^Yb;?&#L&d7;`$LmRL&nUQxyss7phmu#IZsI}}teLwFGz0T{IwiM) z*W{rvkqY_fq~M2KvJ`6|hF2E?ke*!pz%iLTSS@m(BgQ9%DVTRbWSFmbtMg+68{ia< z_PM!1iS9)<=>nrOU@aB|9^j5@Mi^zS{M;DJJ+*e6KH;fA!~dnQl?8L0sMro^d@G$9 zgv`ghasEmrq?_yuZq??-0dgk`c(|%q9HIR|VXuH<8@<~MUX;6>J1U(&@l8~f6R<;-i|S_C}QG22Hu1tKhIq- z_Vb*@TBhvG$9`(V0c@C{v*x@|7_Bo%Iw&U$s=WRU^wjldCsWD?6sAc;bXOT93YZ9M z69oh&vyEThi%A2f-)ET`ptIVJ#YNiU_X8umEgF^!xu07IFF#Yu0nVTk zRNnSu{|>zLRntt}XX&^AStc*aHL7WB>ir@YJPR#=q-qf^EOgxK=9KRNEY0Ct=dU#K zum1d4I&cbr@)J@ZaLMBolKc= z%X1KJ-6sr}WqR9`S$B)8zP;RD5OF7|$Rvo6%gnQ^WF#wsL&%O8aS8f-i+f)62zBco z$0{>SK%99?2B|>rGfH3Vvq}NZbZXQ_qW8d`8`6>Tc8FSkz7#mW1WC4g*6I;+{{|qI za%GF)_q{>tOr`_yo)Hx!wGv;!WmE=*;^f&CKrT|@Tbwg(BEM{`R@&5U0x%dnc{>PT zk$ygn)TUhPh?e+%Ho^k#lucchmIy-UFqfPMOwP;syNnEMXDb!{R@IQdFyyUNT7RU; zzM)e0Oj7J`8~^zqKCi?ulQxz7dTy;^fT0WAUUB+TFrh@c6(DZhzg3okBfvEXN{$kU zx6M>w8B{={Fh9;g$?{BZm|B`RNxs5!JDBJ8J#@B^eO?iagyS`J*6Z*KbX1fUUZf)ca(mY)_A1ZEwMI$3EubCpRHiMNT><)0_K!(R0Rn zUTwQBA?Cl<)0wH{Bek#!#4S8&?Nf!8E|nxGu`0D<$F^0qKJ()+ueo=jg8pm}Dia+s z_qtaFh>-6_BJ-nvU5jESXl7pDCf~C2XVUho64CgipJL(jx2#6~Y7>C^ckx&$(XAg> zD6kT0TW_H00hR=w)T2q=2gFl13{yJ*f%F|V(uJUc4zB{rVD(L0z9+yYP_W2^0*m|W zc*?0dO1o5V342Ooq#y>ND|grHJ!U^MK-Js8jCr}Wh%kl<`hZM+%3~5;C=OD0r*h}E z9?p8Smu#?%1F(}spyE@%q=>i^2z|~u8&{)|)7Ru)4&tG6_}*%~IzVatfnh|e(Uvbf zMN2i@pjt0#v0Is7#oxUa=oKmd_Uh$Y=%O3k{<&iEX2(^x1vC=7*w7N>c3EDb?*qHM zH3?T@F6K}n^7aRSVTK&)Z_Qg%(|+i;A}||HI#o*w<3jz?r*WYxtsdnOb9YXubIXQ{ z%pW#hT%u1hTffXLPjclfm-@+$9Ju>x#GA1mFl0*YGLRzIXNWALa!c|ixsyx1NU0uw z0}Q&7(D?cBfwQ?dv!6^Bw6yF62Ddg;1#9+(c*TkX(6@Ncqq+H`WpXQw-(oLg8xrsBT+8=HJ$ z3T6c{LAw#N_^rbHcE3yqWtRo668zu)KVYy=bkD0Q@9i{RRO#(o&g0zA{x;~)&@b;L ztmHD0J*k)$aC_miK))*Taz0p)%+#jCVbyS-1$pZXG<$wt&}Zl#oQp!t1rNrZpPNyb zjw`7P_HIMB5@{nb3$B>+x+ur$iEcyh16Z{qz7HI^?{ho(ErH7ZeTtN)FqJ1s5^X-1 zeH=3ahvf=>_-Y#DDqE@LcK#VKlFwK!_ds*@xzKyEDsKTyeo!Dm$LIbW`j@54{RGZF zCg%+L;wQc(rDTb-_d$R}-2M6|^oEuz%AUVc!ibJwl`mM^-p6w=IRN@502pv^_>+uR zbxGx(Ew6-Ca)P)v`nMZt4OuRjIb#CPFpjhm$7M1NR&MG0NJ5@f@a6Y<>I)SRj9>6C9{+JxEJTBfF*T9j6Ak><~)6EFn+^FITc zHHBfp;hu$7(5+%})hm5I+yI0|qq9NIhV@&_4r?nt!MQ*?KokqWpg|-gLi_VF?lOLZ zo%eY^1`$)3ejD~@825XKKo~-|No4nPRav4O^nmr1=HujSB5>L}31X8z(F-SmJYjD#TL z0-f65{1yP*k`VxK8?Fr%g+Zv5JgOr%>{ zM|19emJGSi7tG)1^xrY@{Z5n~&_4q4LIG*ZPo{>21V76M&_Q}QIPnS^{wxvAX%DIb z=Qr5{38+n3n$*^oD*&u;N-O(uXdzr5x09bs%OsHWwQ}_TN5Mx=Uex)A{OL5SAr2S*%L-T>IcI>a}^C) zOS`>|>ArNimMMpfyuZAMCi0QjZI#3N;&&?KM)f_g!US7Lx#rhY1Rt!pF{vf#3a4Dr zO{s3A{2-9@p7$|15y`lsPgD8v2ykUotnJ$?gq8_$`C|*e#dr81?}RC$iAkgH>x zSLy!}nUv3a3%02iY58`+o7< z*Q67JyaWuaK>+6ggO&;j2;TX=S#}p*{Be)*mz0b@=iy+p{NpZ@ zpI0h8-MBm<=R%vkYrJT5gDc2ST%sAuJhp`nlT(ZAEk7Zj^p*jbtE$shYT89GY{FgtJdd0KE`VC;jSJ(|)t4!=grmOu+%>7wmJXHURcT zeL_>$gEuMUBctq7G!Fyr{#6RW1y!sy0I2enhk`=$xsGqpJuo=w}vZ4M+$0Y`x zFN+}(VWH&j)~;N9g`N#zv%L$tl=T`NNZ`paUwAhP%lr62@i7z>kR;^+u+7tp+|2Sc z{kXWRxSe;`&I{5HcqVT4RbcYDfBRGR93lgNr<9#L^Q0>>)Cz;|4sZ0duqwnk8m)`p zG3u6@d%9t;>uW$&i-TBPRTTro7n^8WxKm=v$S&&kfV@?JgU<1IL{-#Sbmt=8tU2@` z0BI9q1D@$Tiez>`<6=K_w8T!fZmuLaaror0mJ9r;_JWA$`J@|?X6ilP9UOO_A&yv- zdU=S8&JG<&tMw%m((frF!a%`rClg|sM><=^a?2+>%AfAWn%cVvseDjew-X2BH!FoC zNU%>;Rl>av#FdMwT*(Z>w@I^t!e#I|XP#T&SJQ*+_d&v10=N#G5Gzar5H7e1`Q*sT z33jkgsf_`qK8B$vft2e zHqx;C)TSPOPu7>Un1??yf)yTIV2wmGs)yJQunz;Tzoh)DUkv6_099f!(%?qmOudLA z{Jmh*uEhqqjc$aCCZ);;rFX&tlsOX$z$i$2VG8>u2D5#n$w<_z`($y2Ls+rTNmal- zyE)f;3Z()tDaa7&_7fbch6!IjQ`D4&Y9WEeGNC-g9OjN4n5^K?r|4YP5O-Rf3GSGM zKC7S8at~Ppka{9l^&NP?W!}5($4YVc9P~>aGnE&~;g8YCA@QKWFKd0V@{BaFR*%qQ1@W#7A(HUjC*@~&iY&y z9Z*@8ulA(2$l($wz*>gri*rG4B}%x@tA9hWRU&@mYEp43&dF^B z-)Ei$VbP?2WvOs~KwU{C*z<1&5Cv~9DzJ~H{8Jmfm7u(=--t78+U45!kJ%{v)j*b= zp5uSBeG9~WVvxSN_da-LtMtfk;kh4dwRb z$%1swKt497+MAB+HjUJKfGI8x95aor7}P@GZs%O}D~@@Rxq~^ zA|X1S<60BYo+-m;w5{QG2uoLyT`_e(8^z^SVWfHDUkg#is;PmAHj)Bwxg72%7(Cyi zh3G!t+V&YL$UQ)WpH`XDa(`Ok`eVg4>AucZZ!c4;kqF!Z6bt^LfanpTxMt+BY z*zZQ5W4|X?;QUZpSRiz<+69Cfev3T8Vs*CUCtaL}5Kv33MF?CRZv5!HtbnG1;_0e3 zS`9X_Sh~cuN{BrKiu6#v8snpqz#(g^TAQvwte57+;YdZs#@Ir!x>Mx)0&-$Ty|bGY zAOyq$w9^H$b{b~+qYM7R{UFKS+`OL`i(XDy1_P3#dd$@YqrIghZAuomZbG#qIM^;^ zu>~80)Vx13Ll{j=5d$)~8P^+8ADTd!KR2Uj`>4o1y1`Q~(sa)B7F1Y#0uH)qEvAow z4H5AKl>e|F*ssfoiJ;y3)lPM4c9PD8pd|04kw$V#DCJ$hF-*>ssR`SM}l81#3B7RlO>L$sfE5y5+ol^6_mZc=G^z%dQZ928fy3q>Qb3vEx!xJjB z)Q6Chd`LWsMn^d4GbSW6;kQdvzHpJlWXaKQjYMt%^{As&ic z$cL&Hh}ukBddB7?gO1mWefBr92p}eV;-90eego&!fowp}j(LjSH77v1Wl9`e8$~bM zdd-uytD>466vh5?QkQ(T5f;&S8h3DmqnSk)7TnL4RWl;wXEa*8c%sFWLXYL)7P7_ z9~^iglzEBiLo536F2d&$a9qH=_D|&8dW;#}#lqg= zRvkVl8$Z^Sxu`j+kVVR%@uvl(W*JX?c2TESCjxc~{F}42rfZXe6_hRXnuy+JV1mjDx=Pi!9Hjy%4*OCYo!6fmxm>qs2&r5&byuq*ka?W* zeoh_VM+*|SFkQ>|v!gxJS3vy&UR20+W7sY&43_~A+t9a(D;|8MJ$yee%EUcj025Fq zP|KTN=Y6egZr7DC4J$6j+!_@z6-|w(OB$+qaE+Wq3vGLQPU5qgz9rt06Y1}7`X?e~ z{rD}zXsY;@Ns`OaKhtoD&>e68f?NB%tW<_XQW2)N&4g~JmIGO(%gEFe!jze$Om9by z|8Czr3j~N1m{GkT!7U4mHm!s`X1z_ukwHe5Yqf>^%*`P6em8uZ#bhZCeHL6<02of? zjP9|vLM=?jsBBvJN&Md-p(z;h?|2%Yz0IUIg=8L$dNa)!?FhBB7P#L_xIkniE-n1N z0!_X3J5W3zF%|aP7?QdYhk>hSpJiY%dv2%Z*C<~1d8usCI-ic)Q#iK!eMYoaJ<3wV zg@jF;x_sWW6<7SZ`$QL^dbohN)G)OKf0+a;pupt(HNqfpPtCFRd3{o*69W2gs_G=t zC->)f{J=1|XIS2m{DbdGyzm3YR33x%Xvt}d2+AwF#(DgtmAOdOpr@(QYtW&E z0Xgs1INMp$T*tMvA`WDd_U?L9vGB6==Gdg-?YgwbYd#o9=Q#$8tgo+<5X=G1 zs_Rmdc)$}mtm4hW&!=)4}?@1a|R>e>nlq?m5rdcMGzb84U7E zp=AT?PUVIP%DCIs*Jh0iqod?I!*i~^I6#aa3{5uk!Mc1CTb&5vYijBaO3#kw%JT27ACk7RVgDqf4*X|x$E8_dDCVV~URFF8c#<_ zP4u#TUmE3sp=5qnqPS6YG3PPr$-L>eKm$2Z)MrE7da&6scn$n_;Tj58ZdTmz3qf~i zv8rzaJ1N$Js%L{eUHS}sDsd2JF!N>jYY|2hY`mW322 z0=dP<3^_{zBEjZXz)3$!LIDD)u24lB`(wzZon_)p%GWWof8DOcNV16sj(6XpyB=%{erTDz+seD)WFZ*4$!pLH zz+gsUCCqhlHoyX3s(|n=`mF5>APU5GvlcZ`bJQm=&?@x??jp>z0Rz4%l+h?ni@t@L z{9Sx8YZ!!y8Y}K1r3SJbRlQ!sAa|kVGG=mCE0(hnA^O49C+J%9_rKOthf~cLBpK~L z(^vtqCXa3v#cYiiYu9>SLZ5iC$=d^GKL8dZbHka<)H_W$yNstnPPx{sRU$Cl z`WCZd^L>ij(l!aittP+MW=EB6fnwe|P&GvGSI(T(E-JiucK6aG)QQ9r+PRgFN(&hrbb2o-u5dI zd`~S_QS??BflyNpHdN7p<>&k4mi5EnpeEGtx##;NCbXK8Bh*;XLhjpHoFt6dZ$xRx z`#2@XKU@>LvuMKwC*MtkX)EcAG}l6Uo|m_c-pA3m367vG`94@-Br~op7@;NCm32G; zo1W*^ub?v8_ZLGijvM-(OoyScz0*N~ZzTd#_>z4a7Gx~5&la%GpJ4!+liR;hPp=ra z%fza%2ndd3)&jRhEIEX-g;nzk%bY8s!mJhw-N1alk9GTsW)yJbQV537V^BT)oPEovj=zI;VRgM_~!Fvo?Ba?0Rfy|5WzFsaW4~WgYmgM z6TaU8K*B`!f$RQi+3ozk$Q3llj@A#t)VjF++nWQLP~1rkQhm#-4pzi{ynz^r6KMbj z+B!o7-oIDV{nD&u6o9BGG-0^7kVEJ`xK9#` zTm`b9i(2$loKE3;=I7UFp=jzq|4WnWV}tvQ+u0%_u9#6pCagV90XvNqdSzLo)a37K zC=FBh?AhNqzO78_0t8&jlANmVWwpk7GXGkKO9xean~u5vO=O}z!?+;dx_=_ONd zaVk+XpKm!5!YIs~-~KE5dvXRedoe|=;@|t^_igX5h66Pxe!d}5`fH@NM!))b!}ss+ zIj<0ExkoQa%mEB6djlLBzu&39hg(TfOxXhB2RD#X699nB{VeEdC17|(K*`TlYFvs- zegE#M7u51MQE!o+PND_n-_j5s2V2V!+KRc&_k$~ofoqDcaY1zUWBHyDNn=DRzr8## zk>A5b1!R1(+H=e08r=>cSfCMT+Ixc32GUSdAWVX(D{Sw1;S;3r8)TT;0 zU*TTMDZmU7Ml=Wu`-d^P5F#2fK;U*jylgPK$4>JWrh;-gb-^-nhFWn-TWh(u44)P3 za_jOQSQTrS0$ihCvqI}j6cB6y)F^P(3-ti$uYN^U76m5)$z=!t;@j9%+n{Mi+=xz3 zPt*Inh5Lt?N)rAyi79Ec2QgCvf}4X?2qZq=!ts`N3c`KBx3I^2KfeakMPEoZ-5hbK zRK90VTv$=Yuf@i?{LlaQ3qam;8#S0uf_XTK6}p$d=8#H)a!eupsZKZ*FzD&5tdI+r ztwiQEVq_-R{YbqQG<{rjnA5R+CqQiMc>&JdJWVosH zw?mqlqe0Y3ECi^5qHM(dWZPUqw&G{E!Qd+BEilVa^h*6|aJ_QmS{jFR9>-Nv@laK3G zZtdYT;@{?RY?G+{-Nd)$E&I`XJpZCB&|)QBX_$uFQ*m`L;^r7blxwuV86kvj%7IXC zC1`9;vH;Q`vVtq9P3e+#T_lpl@6-I6aDNE{@;=<{AaYr|rKIX|r7%ek<$nusEzNpA z)0>(e5QKP9A&L8ULu8&M;}!FG3%Zr93Kp&qzXzk>K(K@nrwA8TmdgRH;OzFWOvGi* z=k}CC_;bsc2Vs`}i~_re6(BE3MWcT|a!P`QO)uoyp1O)?qyIT}@N@BdvhmaPGM6wb z3I6+q{EQx-~?-M;gIrj0mAccXYN!7`4WBdVhq}ZS~uG-QKX&2<#O6ZOr#D3@`D3L;m zQ0cQg-|dTAiAxpDNHdtfR$qA>OX8C9EgV9BS!D-@M-BNBxJ^%3g&e!mX~4IZ+zobN znek!f7WLPLXwbJ}grtLaLB}b=wniA|knagpO0Qx<-*UnT3t8mWzcmiOWEjZL=^m!pWPT0^4k%1eYAxmjVN7vFCY6%zXpnV{llB9pv!EK>8F_<3&JcI` z5I5k)(#E_EB5t~+xtcvPMXr|v0j}y&yBPV5{%@p`6AEeVi!fbY6aS_BH^ev+Q%wx_ z$Ivoe7pE645?|}gtpEiOktIX`7x8wz;wbZ%#zWZR4#ijPWFOjGqMPJ?n|iPQ%4Uo|U0GSxo;HtaaF4;YFgb zAaP}NO;g}L6_CxmaL=OlS;)Sc28E#YT~vWLDiqIa_3C=f)#Z@b!F~(Ri(WBdP?6!<6r;l$DBCNiVo4B{k@+EVO5OVG>(# zJ7pk*1%b|wM_q&jDhQBXa!+`%O}dg5-S07V%bZ4_J$8i{few(WY$Btx&lNCF0xtAF zFZ(=MU7@qbOD;Qh1@8NB)MF*;SHJfAGh3EPa250qpoungtM1uaE^>^tO`0j2ZTNhf z_iyu_A3+i06b)?|G81UdV2>3aCue);pUnm_4I7#>1ZuUK$8g-FYq_1p+2d@(i;9Ld zXXhaI!zi?C{BS~s5vDo4j4O1xr*7$%EWXbo1Tg=+-S1%b0m#+gaUst@`};B5>-4BL z6(Un6>h|(CDz0!l{(M;p7)N0K{vah=|K;4L18~zt3ki_*v-8;l|K#gRT5vae7_- z&d*!>vpEFipbua(f0R`(C>7#`=l`QJNStbfOx!Yl$A~%3`LxTt906r|dIk=jWyP-UO+kU2YeZPemNqiRx2 z2i!~^B}z~36BFN>_cJyIQS*o+-E7W~cQxOXmv`XU0hA#cRYq-jt4Y$$q92i!ZA5S3 zRl*MoVSOmhYwdsj$1Nt%o){GuCwok-%+Rb`#Lv*7u-0^7x~JyJZ(hRwfL1N)RfnA4 zu!$+~{&@;p2|~s^WQNn^`xpT!kCyC>`)vi_kib5eCk`s%%9S&&CEufUfyQ}Hn7_}0 zwS<;XwB`LB>QFzqHr3U(hm$Ob6&Q2Ph`Pyo3#N211WBFdBW8+vaJlM zBxQt28^%9rovrBCOX3=^@>@*6Xu2FYCEzyi4|}F=?Z+{~epyc6-og|{1tbIz)HOa> zZDVRrvZ}a(m>~mh4fin-!CFS-YKwuvJzOn%dw#u@DPAfy6kJ)|u5ejqn5j9?E`@+D z3zlvnzYCkrbc~-V31lT;Y9XHmI86?;EN=m|So*nw5(a>usc?UE&OksJ<CY}2sRX~x4?Ua$Q`cuXpQ-rpt)y1VJQD!o z_w)XFX+{2cu_eQxducL7;8pteo^LxMu^q0qrp+JyBktp*f7b=tlxwF~LZaLAF`a-4 zey*@;J3|E{0KZS2JPR5P_lxZ<`txe=TqT0#0Kz_-zL158Kiw7jI7nrKitaOg5w`hX z7dI^?p>}J(;Gx!nxE9^uU!=wA4p&M-(&aLN5Qw!*TRqKLM`lVo{ZwxlB4r7}N8WqW zpSCDQFfae|U|9Y5s}MSTN-475`8BvuQG8KRd=wHU{HJ$~Uh((bRJ2~eY#b5`0AHsA znajp_Pyxc9fW%z9S>MT*nyEq%ZhpUUK`exgUk+KZ6Ae%pG!3L|OvGQ4T5hqJUQpB4 zt_kX)a_k8D@qY8yucO(+yhC8ixCU?}=T@FN zri-Q#ee4w;WAywQ`eO|S+Lh9iu0=>MvNdmiJ!J^HL>^%+0S(l`;;+n~n*Qy=vt`Ov zlN!JYs|78902=_(MTri1^clsmdE^{BT&fXmlTE!%#DkeVsMbb>E>52-Hy{L97s=eV zGiN&|pvn&R?=spdURUq-HMSX$ymW^I!2-`p5ay`>n*py31^cmv?N3Gc4%prmm`o^& zcxMaeGCfWF{NCOO3Jb9D<3T`)(TRZ(z;#ntR#xo%^aW!vD?LBqg{06*H)RWd143Bq zq7NbVBGP6Bn76B*+Oy!d84iIPR~ceFG-r>r^CG;T!*1cf0ZPFzxQ2dKU-%=g$3zme zh6i;C>-!jKk*R_`rC!e4Pf>Y^QXyE?<#m4YSrE}KVjU8CX8!msr;ujl{%wEkyER1d zowM;H8;9W6XTn1exZ2>*76_ld$ajkp6**Cuj8A13FtI-0PkYLS26G~qmc9K`e!qxB zkz#NzO!lVsN0pG66<>ry|EerK^yKU&4o2l%Y)rbJgN>02>-b(?Q5bxfZk z_E=nka4EbK=CvEw(6^X%w?@=QHNz~h?cZW5bS=6nQuY~CRep_!fZc7H@Dt(t@IV*Q zuDMpDtT6Sd;3r_3Q0ZjCkMn7!W+3|fyx*=Jw4%eM-Rt7k2mr%o$#oTeRd5mWbFdPY zDZvhu)w8QIt6EgwNx_o03sJ9p$_-(`u0pN$dy!GC@76CPo$~U8GHr7mz{UIp3HgzG zPwXdGKyYfH?aQ^-p_iGw-K!Ef6ub^hlBw-LOGkTMua~UBESc%lQTfG|i}}4BE^WjQ z;8*cT+(vc2Vy*+2JYR`uX7-2YH2WBWOK}hN!!foPeBUhnw<;!!4&E}n6-MVZdlzv` zRmE^AIJ3QVbym9vc?B4vmu@J^h1EKjvN3;m|MNfK`vId7-!@HI;3&fUVnZ(diwc

3~i+h z%L27w?RG}zR#tP|x0Y3d+r5S$T&0jOLcpYg749LRw{H_$007IWp}2h;Vc?GbRKq8? zWVk@S6>K9UnI+wv>7V>WluRjPmgT+_?V?ft3Skpk89_}28lhnW^JsN@8T2B+^X(-` zyEFv8hY(jQw-tT$c-k)6qq1N|%RC!l%TN;bZ1so^_F0iDC(li9(Mu9^1i7znCw*yr zF3Toh*B<}q(+|AGLOX#)*P=%p66x&$6Gd}wP1Ob@z0W|FHs`JMWq%DM{wO0kA_E{pom<0Rps+M3laR@Je{Ulc%R;t)tkp1?x`3d_WqM0a z(0e?;J?QV<3g8~A_NTUcH&2F$lBg`BM*I0|hafp5GnEPuZO;4ru!CaI0&2O;v)tY# z>RfEkzGKii3A2aue4(I%FvRv2(ie$pndxl#^J%75+uwd}-E!Ebby=asbt@U&uE56_ zie-b)yPtV(G445k#Xae9VO{_EzZWaOs2AyP$q^ZZ%jCJt$tD$l@Ga=K=I?{Iu{lE@ zqYQ&(txFTu$Ua&z6EeDr&N>Cf!YzdZmT{jrMVZUg`z$eCLn{nV7D|5W%=0HWskt|NZ|!+Q0j|;)v#%(11ci z$?xy&gmvpmPQ^Hx$FvgO4irXkEJLOO3z76KcW$q%XAi(2TIJk;W)H23auu0CDwqnX zD<&X05pO?t&nR>DGYr5)_!s&HlbUd!ta@*Crhn;=wl+m1k)Ic0e!!eDsrT>tF_tMS zTnOwNlB-`5PM9KrWn~0JwdsqTY?(=|Si=n4emouA*7i`D?)PxFVQK_ySm9Czxe9(U z%dMq`k%@|r8E8{i;u4YN+P=l_$0nfsd!sD)=d1ip>hEY7OHQC#i?k~MLb%*U0GNRY zSG~XcwB%OEWJS;iJFQ>?Zchta?{~swepkO%g|%{$C57p+P=*}lT%Z^8WIh|Xye-{w zWr3igU|XN+6~fT%NT0{^nc|{odhv{~J$~+)Hss2s5}}5!RbTY`T!8Mu$NM9`@|MH- zyr0--(?dU-|32y^_Rs~*;pzj%32V=f@7)73z3(w{ST1W`>~>vlz1YbNYV#6)U7m84 zot>AbM2pSKCg%#=B48b6q(I6nP$@rVaEL^s3g#;=d{PT0P-&OTaAUcI-~AjPC7A3L z4?@gu^FZCz!OhKX9#3)+kCEEj&CUI3?;h)a{rO<=Uf34M%*MfQGa_)YP%*fOU# z0>#>XlS4p#7e1RXW`099sonRq0rZHgRG#NV>>Okw7tN%_;+KsFxcX0xLo|MJXz-#m%L{ z)6Bl^yCDHTs$DVXCOG(&{I&vHz1~G7$*i0W#;viXbhXNJvBqIruYoatZv(?iw< z|23@{Zm=B$+Ao7L-<-GN7HXeFQE#s{!(pI~7V$HrygGWZ+OTx18ykQ`;Wb)aVX*pL z<$mB1!u^g8D>##YFO~^JlzjLW{3L6k9Hd3T47zyhs7nyApjk3B0YN;ejVNhnewD*< zI}hJqH%dzjrq5{7?x9;g_}*`sUUs<9{_hEd)5= zxL;#|OO8UY(+&L)^ZAw7<`;!k01JSI(se1M5XJ%j{jPifn@^J{ZWsKxdcXcA+!~Q$ zs8bFFO){HY8(ZycT=X+Az%5w01FP}oq4*(AAl0ZAa(c1{+nFMaKkJVVXDk`O{( z6jPCME_4t-DYC-OXdPL5i;qz3A;#xEaP*Op2mCB%QNNT8Lgc)1TH;R9YRn4G{6Z-%6c~!t)U$HJ;p78OV4a&?$o5YLczlg(PYin|9 za}KE?D><<;v~YyF;n3Ky2xz0)bC&v2UAwAA@oqi1ugVw>=mP%MU!}IWIOCp2J_v|E zX@4P=Oma};4|f(ZP#`}T;f_%<;_(CKx_%t|A`MP1cgXjsBBaY*5h&E(q=E`94=S#{ zX#+T1@)~t>_)5ZN8Pu$KowsH4;U~jm+LQxtO;a>70dgZ=n(ypR;O4V6Nt?!d3lA~m zm`)xfq|bQMzSva5!9Z0f&r(ngXz&29%`|Ia5-7(^mcw@++;i9I?uf`f%H`=nZgm)a z8DzL~YG-P`Vt@yWD|PU!!CpW`>QvQbs-;LeHDOAkQ}B!yy-vF1MO)aa%JX!cm$*K*4pT-@7vBY`3aW92zV+0#TdqQ7S%z6=^>LF1O%8Lh@ z^N?G=40TyJ(-r;;Gp$lsYsOp3zRE$GB}SAJmH8ui*}SjR5Q{GRblF;xWr2UP7eWe6T;^3t(#pyr~tK` z{wh9;J_0Ya)=zPAx!Am6nGTmVP;D>M$+;sJ(y`i%LZVQ&`Jev{wbBd79jx}m!oRYc z+|QCe7w*CNoWdfFA%rA7(%Y%*?~{?x^oEc{)C;+S1$3LZe+x#hz{nA?E~aQrZ!IG3 z^Y%;C{$%{N#elcSY!BR##+64FFtc%$Y&RIxYAjvbv6AT(lmywPZ zple92;_Az-D8o*+eh?O>mVSa5kr{zo*|RV~$YBxIMSGQs#?N0JE3_7p?mYJ>3ur35 zO=h2$GEJxlE0AMWb2T zR!a5+HK69~`>?L5xOrK0f(NhSBF%klKy_)HokQ~y8pI;c+neUcH}DQZW2v@}K_=^Y<3bz-ekZ>S3Jc_7<^P&fhzy zu)iBJ$IcT22&^JD(n13foKz5M1qfSv2E{>SDZ6+^w^P3dz#0}4T6zonv(A3+!=81& zZHm~+3j3ML^p^WuEa!)40S!RJ@^5=jCgME7Lie+>bf0yF(XF`64SN;TT769h`jwCb zjBRjfNHK*i4AU?HaLIsM(*|kbd%{}6e_!|v(}wB>#c!jACb0!!8yC`~v8A21fPhLZ z7f5FnCcQ*EH3(gAi39R{*>+I6D5i}~V|({FAQBWDDK5tlQ{PWaLgB8qWS^yUb!PJS zM{*D6yuDI>&oL4=2&@~13|F)|i)ot-Q(JE<_0`$o=hZwL(sXUby=I<&ZumS-zz|@z z5@`#%;)b&JbFG|@jzf7m+!LP>Fe3M4&xO`ivmev^wGUfq42w7QO9>fy-d{eM$b2?J z(rXbQlGGHYEM}i2gFskqtt_`q=pOgTN+1|SwxTd3Fi&78F5E+J1;7eX%D@!Lp^PK~ zJCo0?+gn&}W1H7a>Xs%90t$oT3d*{}`!RSM+B0Ej)`r@27!-%@*u9EyrJbe_1t-9h0)1kctI*mmWhKflpZBBNz%prC$*_J!(? zTmjar_Nsz5ncL=)-iQOtLWQLN!9OZZa_l8Wk?K&d8PMk1clEX(c%TJ{gp!NZ6yt>w4f7HbuvPQ@(gsF2{g%T_$7taTrtgtVC3+5ixT6=eH z6;P}c5rW9GT}HsKK&Q2_riR%H!max4*PDFRxra7=SyKEK_{AhGYJM3!S-~vdvRG^& z{Rv;TaY}FkW3olC0z@E@Y6M-cv8X39_kd6mZwMy(MaPM6c-|6CUd=JQY=*D^E;MV5 zg)JyEr9W;*CuE4t+0BTq#uvj?_9}>gH9N8kW+A%*w*;;}rVzrzOxhuIiSS<1BJ%lF zJV5P)yo(wBc`-8fIufP^)TF08H5s(ri{;7(W?TLJ^AB zYy6kcWRgkZo2#{JEv}+CdvlRHVkI4h5O#_iIb6l*fghJ-^FBnn5xB8)7NqhE-_kBC zIp`}V;jC^MCGfUwDL<8bCn&HWSq9K&(dqJ)D z5CP3RabO4Jeq=qfLKPUQ^yHwN4EPwKveNGb-hZte%ws(4hD@WT0Kbk zrYHrg1(omCz}qBMiiLHqLY%~bdYkLC;u#yYHmNBJtyBj;d!|h+z8b){?Sr6!yAQzQ zCF)+{PU?5xeg~7_&kCtrl6$FtixJt!i5=m{^~Dw#K+{e1Oq}0mhKuc|s9I8We&CNd z2+gAJF*(@$8$OIJMcCAHQ9CC+p#}rS?vOlK1Q4bR=I!a)sZXUfL^6E6k+FD`)Tjs@ z4H&pW)&NUDw7=Csc@6BH@01rBs=ti33bp&9NVcwBM$5x3!pmZThKH!;VPp{!Q?NAz z-^!rg@`6tR>P+SB5Cj~2QekWFFt$oaQ;dvRrHdPf$%888V8~^NXaM2s4~^eydIxLWAq8C+#$B4dAq{{7 z5Ff0M#4U#V=xmJXAM9cLNi;l!wzTEn%F|wOZk4a`R0gI}0k2@lN@&1Y6RD!D8gaogs5<^}?^NS+YdH0@ zB&xjcB7Yl4^NXg{0+s6UtJ=UcLJo}T53Wm%-6gQp&{+ndclUM_ARy%zM`mhi|EmXp zDq5j$7h=Z_^9TxGJYnkg;7MM2w|*eLsuH<6E{{+Ztx@oSa8xdwa}{jXbbO*~3ZLa^ zFwxQGgr+Rc8W5afl6uyOuST*8|19@S@&euyI2-1uOf^yDBFk?!LZ+Yr7NQU#q*W@* zT9J_dHs_uqzhcZK6+0;vtFlGX2LOt^`ethAV#T+voFz%Y%AZ_Y@>YU1S~0GO#IfOW zv{fQ0Dx_0{m_OLhK=0!!sehub|SbvY5^rn}UTPYx} z?8z;r!|$2nx@d1lx3M66%ovfdzqg@Q`4QA5K{{uh=Udo@15}tWoPwVzU8XO4BE8T0 z!9R`^Ww&MR4(`ef1|7TWWCqI(vIe8vY;FG7f$gSUSM#_uY48%X$Hvi!$+c*6?T2*B*%qdJ>O3XBc!q#zA*t}^Wb3#2Bo zh8A54{~+R~C1~)+Jnix`b$gOe^(+Q@Q9sk|_~$wo44P|#y5KYMgg@POo=NeEfI3yW zwRY~})oCiwtGd0bRff#@Ne(nWSzHnoz{t%ct;}I|MZ>RU7C7eroVuq)=0l73ok%e@ zF1Fn)JQKq^8-Rd^g)ZUr(L(@6eqX5)8s%cFWvcX`zw%6zKYWiB%m|Aqpw0YSbZw9; zu+uaDz`(5>8$B?gT=Wp-Mxbs_``EGQ*WTZ?q%$o%5)=$N0&fE#85Vq z8U={Y5Jxah{_E@dBL~7xJI@q*ER4l1oN&pgkrHz52%j%ed~<38j{pdDwKSQe!sW03 zEj4`h`&qwO-?e7V0AAHq4C%6=uFn=0ct>FHktVep<5obCysS;=6!T1#n&|H;89Ima zyiIhcW`4tVGPQR@9>50ow=4I|7k>kG(e+YvO1fx7fa2cA^jq!ZbbfF?YXm~)d_}y2 zpNnrvvGMb)8MYx*tOa+`=bD@r1;)%>D$>_nF2G+?PdVlOYuMdc^Bi?5o&duU5+e_u zvk{PD^j&}K!22@k&LPiY?ZOvL=r75Von2Dc&B?fvFg2}BM(cfGbirB|;qbW1wM-*a z-F&<@Q1bAnjhpHkX0^n=tA9`2C@2;}x90;HVKSX6Agt`*qLH4NovjXGC+!XS8yv(0 z_pP8f{#PN*Ibjie9k(XuuQ^m5QNJL3XZ8FTCWhCcx3-!#>pWQ?%W_KL)qO62cUZN5 zV*M>h!y_4qO1w$FaHn~d?!HBc4(|;IT%O<7tupwxGHTvvkawZlFsj_dj>qu8FxbWh zkCTAg487cY?G|$boQ4Yhz&FuI28(Sw+d%E3O5N>Ma_s7HVYKzfnMN~{qN!->a%;1V z2ap{6=l=*I0k_`!G^0xrqaw_NSoIL+mggjq?ul1Gc%~NMsJJYzOUr~6hbOZiV~RDd z1;mgG^vdr ztt*14;(=fdTVZw%~qt-3X*1H>&Lov!68|n^^jnu_E`}wsZIciI&}D;EuOM zzS4gF3jO_e`$1%$>EW@ro=-8Z2(}Nw`*@zq^l%TRCnzO@=89>r43^W2(Bz)$w&rmJ z+NA}J-cT_?@0n-ja|CfsBllfNf{{s z;@{)2l3z6&VqHjWEdvxB(CSH%!2+rL@(+=Ao@~(ym_=bL1cEK=Ctd$5EOu8l{rw=u zf%`2uH)%x53jD_4^){|{e_>K!w)*GtSvy4mTVh)3UH(R}u2sWy!C51Inh1v?UPG?< zGl(|$>k=;-(!x0fkQnH^e3|eF+yk)QQbPDDgI%MJ<5#2W-E{YAAL{&`j+Ud@PfVY~ z+E+eh{32vrfq{VEkp>e2Y|@w@J;F;sQJ%~q`YuBFmPlZk5S=Ap4-IeM+MUIb`2)K{ z41l8jKnRP?rn$>16(W}gyw@gi?H_ZnO&D^ z@ikeR;#!_mqdNQSu#CpTA?CdUnokry@h{A~d028y1ue)qCDBCVmy8COR=hE*0^V%- zVfUsM{(&I+18G0rs*vYb0hb6n?GD2k;b$(8@^}OO3ax;b5dwPj-qd@<8dowPv)>xe=c*~Xi^quOZ@x4C!x+ieBaRIKBgBe2HB zbbEM(s|v+|SFU`^SAX|_3T9n)8&|GpdFE$03r3X1XGQ>FFDK6XbN5-BY)c8q<+gkw zgNlrcF=P}Rg0Vx9NQaZ3V;VL44FqF);)%LbvjHMd5*Kws1-?Tpg*umGh)_B6MI0nbP*Ub3TSf;QsKqW&cau$<)sOiBpJ{2DZyAr#>( z;lxI*>5wWlRVX_z`d~Tco0L+`Rk0t=L31-=>`kWm%BQ58x)>y5^PqrEMTlOd#r#cji(mw@bz`j|XU zE<)7q+aMyzq!E85Otlfnu>)qR_H|grSMlf|IH;@Sl>CJR{)%_;=FX7%d~o|G=zX1r z_a|-=W$-pu2oQ_NjWp{C9S%$f9(S_1M|Ecl{S-^4RNoU0rJfhArr_cUpGDx&f#t5z zQK9x_CR*p@+(*{}hGAnrVsST$^4)*zlpDrU$VjZJ{Aq#*wc&WXZb*V=03}ahdHsA4 z%uRxVJ2ouCXp*7-WK0~t+ttYSBly-}UuV9H&nhGEX=dozo>GA;Z+aN1{nNvYWS`qs zh){6_e)FD_5xKiuZMez<608+axB$Q>NPEf;?{+C-%_M1nzHUy_NZ1jP62;0lyYoem z!h~D#DyPj3Buw&L7v3cb0Xglpvdsq8hxbBEAY7DA%6q&u)QENT6sn{J0=5Ur=a?ai zX6+-5mS86m*=8IN-r9l_QTt44QsN7L_QrA+c^y=EJDzF>>a?OPGJnwQ%b6Kc;BOw9 zHkVmcw0aqrc9TP}n+q&Kg*#5|b!q2c7BqWhP)K>;4gEPWO|V$Y0;c=kQIC-hwz*x} zWM8-}v{4JxG<>sm34g!ZbrIs2>`xDJNK#Gh8#6uLX$I7f*@4=17lz7%V8QwTQ-mME zWWhUo?x?kERppN$py$A0Dr{$@y~?*WhHCipkh|D{R#4z~R!M#IOME(v(S_1I%DE{3 z4fRduNxn%wx2Z6VASn+!s9)=gWrtaW*>`z-G{I5UzIk3;)Eo(X_{TPK|M1?4R4>x+ z{+pp9v;M(@Z4q12HxgL?`ET%B;`ZBiOS2^_E;w}y$r&iJhH~$D4MOKKwb0Vhxr6{- zozsv}OZWub`q}h^8Ted^CgQeC=^`>qTgzJj=Sj5CT7^XpZ$uN`s%(Rpwx4^Z-d=&j z9{t$z)QUrKptgPN+DKdG3g8szbMtueyq%geV4$66%Qa$zx3^?nH14zRt@=`o&-|uJ z*f{gcZxrWDnpvm(OCn#2HAsz*NtiCsb1)@fL_@ry!Tnym-+}H4SHvwS1Z{>uztYN| zKamc4%Zxu$d5PNcZ-zq#)mOos@E)H{>V6zfo~%}zq7u|wknprv&!zrN%5z*Y`ssmqtt}?iC$-M zW^}SlO(Rh5u{bMn&yWKt5OB{x00=N#Fz0H)JgCy@*+~Pg7pA6*w8Bu=<;r^wT9j~Yu z5vbujuh0vcD7)@47^JsX8KB>13%XEf?jt8=5JE5{hxpBCNi;vANCTZMC%rWlQm-H* z`(Q&Tz%qyUhjs&wPf*#*r>|m7M)z5O)&@2K*2^zcqAg@XM9(=JVzH}C5h$4W#r8GU z0h|;Xz=wnksCuscMA$|ds_D6Cv%X+s z@~-MQWXaeAzcKs_oT7&*s)|2ajYQW~C0JN2iRsHh=kYH;6m`7?a)U6FS3`=hRK~-> zx>lnpq$8pR)zzLizNbI9Jmp=~-z5c4E3>Mv8N#{YFr-E?eg*-0=d8Fh9eJa`xaw4C zL)>Wy0f1yOIOJ6Cq9Z^7hgbh`>oi@|~#QsEN$x@Uv8wm@3AagQ|eK zxwOdOfL?=!`w;UQI7R)azr!$S!Dj7W21pU8Ev+}idKmvq1kY&>N^Su8UVB3X& z9eCri!oq_}A%>Acp|O0(oKERxi@j66h~?P5Tj?fJ45Ks?+7^zo*#buIhDqPhz(zpG zE?VGivt3}Mi#dJ8_8}L#$aOBYs?z{rN?)Mz6g1?sJh9h(>yHWz)Hp+GB3Xy?=9@D@ zby3bHPXYu~*3VP}||??ge~%u;iWTZ$h!4+vA4W{NV#kI>xC4TC&aC@CQ}P8(0saZ ze+L%{^)su7DNIBF8ePS_PtZ)D_DdwdiCGN^z*lO7Rrxy9CkoYT8Jf4;i9sNxTCRrm zgVqp4j3!{g!^7FKKdSR4#d)dcbWmNfpVPwSIkO77Er8$X{OAA3z?l^=<#O>h5@=X^ zR;z7f3dZrak#Y^0Jy-_VBIoczwG0ECOJH?R#TA#~v=K1c)b!iO00TJ$5w2NVn&Xh) z&qTBk5ZmJt5^a$au)fcheHMMo5Z|I)xyNYFS!npX;ZjQ=V%{cG`IYsB?B{!e@Rz

ke|*Da(jvGr7OR_oIr390WH&?d^OMf`dP86 z`F*iyr4^@OFn%rpTJzXv0U`VB+YrbMiF0*(<+dN&l<7zYwJcm#>a7s48}7enK35fU z^Xy3Z0cXZJVJ*K!MuX;;`MAH5nHn+)DKp&q*$u>4OG(n{>EV#rbH{#+v|UJ*H{##W})Q-KZ;egScte3QN2?zl8_lc2Fc<_ zwrqwWX33o!1$Z^iIGgFU7xhC~$KSdjJh*InG0q8*@Q!PBacKL}Hn|-zG#s&`A&O~N zrtVhuNiuiJ;wW1jXyR%(bpNIjIaYst+y$Z8exw}=4hhMPI7RNTvE1lFi)KS*A+*O+@mN8#6^XohcT2R3yuPgR@

b1m(3t+C($|9-*6N3pjX?Z8Lgw-)a9D=A{4@a5h&B}dhGGM5y0_Lpb*}ye)xk*#4rh=G{Yd62uA*`uX7hi9o zSB$+=`rQ5HUL8d9Ez#K+rc%4Q6kP6X)Q$23o(VoPoavO${e^{3oQHgk3eii zYlU0j?b^oYX~Ww;SA@!}AGN`QXVTf(7;GcE=ycGt76N-D5F(vQ0y?a(Pi1Q|O z>{TtX4;B$O1fA3tgZ?X`uyoNDM>NX1jVqMpG@WKkuj%{2{NC(_tQ;|~vT^l+ZqVVX)(Q|7uk&b!{`2KG z0P&k2Pwyhj--b$=>QX>--Uy=h*Cc?XFXI;2T- zv2Yp7mnw)d4g!HuBFHLf7rf>$tAznmTmpIL;6j$5kX()-DF8_%sRxB-1lB}7k@+sZ zt6jGttl~-tQ}3|;Rpe_mZHTi|=~ADuwqxofQveTh@W$CpFxq0e9ixBt`@bP3b9Lg? z5MssnYc-3sR|V*~0K+`OE_2yYX@8wHgTbCG?Nw`g39>VbdRjR9W1t&R+@RrERE!|v z`YDvO=NEcn5~tla#vs9~xP#n)mGe@(pWI_NeffGZB|Ev5pr#V2_-Owa*W+?qe4ntZ za?~l*m&jen=IarPo9<117yVtJg8d?W6{g<${P2v5^4{0$jIqiARO-9@i7)kT)x)^V zE}q^TMB)8yesbMd^t%UGbEa4gJDd*)QQjLBYBIQ73IF^AFGp!^n^SLq=Tr`fOh5of z%`CgLmmA%Ndjrh5)p>!4R&Ga?)^qs$0ZR@G=f>&bzUTrrpG9z@x>0akb$8OcK~3aZ zfp7&7hS}jhbiG>IRIkH>3<4Ig$z1dS1j!B#b^|VO$l9!zgJn_p_AVWjYVfYqUvr(@z};7&989?z+p9EaJw%jK=?OX(_&~)Vk{&cPd1j; zNi?VVY&IhMARU6}rR33>G}TMaI-#9ZP{r(xJXAY{^tz@3)!apkOq1 zG0JH`^<1b8kXPn&>ApeL#Qsa7{mKQ)mV>RN%8xb0 z`+JP~guHGz<2o99&WUTU5JOFisN-j$P5JNmd;g|kX)$;d1mpsuW1ix_a5;>I zI59xeyS$OK6ew8GU1*3(=$k1%zfJ`#H?XLA*11Y(=99e#D^JeW=VrG46^xMuE$d$7 zTn7zBDs`?~aIqxtFAmuRwRe2pELVAYQ>v%*j7^3NxJ@|7Rs}>5<^xRd=SvX?QQzc? zGs0`QZ+F1A&`e1>MdIO-!TIDc6>0qgam5U!Lg^LPrnN4QG?BQ!rnD3Kkh3Bjtsixy zltcuxPQt*4^_A+-^~1fWmv+wt5YddS8i}!ovC;woyv1U5Zb>0Ck?J7;gktyN zs@%p;{=P*uQ&{oSyU>!eRl;`ITaK2F+eP6#00%%cG<&I67k4*~36){(seAB>eMa9HFIuq{_VC4RO&?$8P9f#j?_Ysf);PRAzSH9^NNp@GNs z-jE`PbjIa0&{uIAu4zrfCz@M`;6Le28yi%`uSwh=U5i|W)_Z2pB;)FJCDpUPE8Tpv zx6>zR-R2pU^v{IZ)iiK!sz6~lqf{*+z+guXnuKe^SuLC;efTg8bjC|qo`POLCFv8k zUcRJQcF4+Z-;#~6GA|;mz$Ft1L~e?wZrX`xF>G$$6{0c$2jxN-JTBm{P@%sfokIq6 z|A_w*2>)sW{2bU~t$fASB`IUK`UdY2FL^uEx-{0yW_o`zM}0q$EY_@c3p@8mKtXohlSGmJy1zTrYGM8oM{x-w_b{jTcm1@~#i(8XV)vXdaI=eS6=(5O&6fJO(>4*f^YqwRR`InoA zLDV&93xc`)yql=2u=x24;pwMB9xfu#Wh6Nt)YlffRO&_1f)2>wepcG_QQ#@JS^hykPv;LTX4A+u7Lz)Q!pEYH9u)8?hB2U`z=4wtYo8pEA+0*Z zGZV(teh{orkM5@x_}+84yWLwkPQdK5$%q7f4iC9OWwZsH2B1Va!ukaUTQ2UNur>h9 zhrj2B%>(Ddw~16PPTEbCF5oAB`?v4sb?Fv@W(Xu8T&LI;bYF3i^)p>y90AnTqi!xM zVm){L^lV?@<{_L#)~-ub?Sn5-F=_!*#w$nqs&-XJhp8T|s zAm_3j?)jGcr)43#?GD^it9>rc{*+q&7_{CGmu^mDWgm^Wu2BVq0llm+b<3iaeno^_ z(nMgTeIINdZ|mi2k2(GpY8_lpJ0!knvZn1Hx6gz1%^@4tFrXWM!o zi$DEPTt@%OzLOHH)e6O7LM;f?7WS+mVy#+~HG-k($kBN#@xgUPX3F9U(UR!}r?Ww! z$2cC6`B@V`YMxVr?C%xeTIY5KW&Zp|#;W(ZMYn+_+HSW$Kq>so!I@Vx0BQbYIEao9f)>Wq<38l^*fd zHY^zOHk{}CY~Qmka3LV31*Tm0TtbNeFbf6JD2N+d?rhs>xUzMhOZIM$7{FDRzoxg} zQ&j)~e0wS#Z_UBh`uwC9cnaCi>VCd|Z&`^#3=RU^1{m^ojA!wWx4Dc<-mLk6h<9|l zoah1ckVTRiuudKCUg8U0_I4{AZ5m--FLS(J>~}LT0$@v8$+`f<$1P&bTCX$bPe`!Z zqYi*8EUHUMv~Yl_9Xh{3SQ02bG8Jsu@wPW1nCdkHucJx_``~b(y_GK^ zLsL9e;m;O>D5aKOUQo`~@g8j`;?pdGMGP-pwoY5kdI|p!4JVMc5wgWZ$NMCxO*1i; zd_MW>RZ*vYqkRV1R)jd%gp6yNVJyNOsmWJWi(^)^JYo!OWt7u(7Zaxio@BeOb*r{$ z2?0Q^DnF6ifmrJhfq%IDcjbKAs#RRz7BZs^LUNbiRAJ3gxz-Tq4r(7Fm}EqRPY&o~ z*77PcT@YZAb5;m~MRt{DX;)X^-=?S+zT3bim)4-@ZBU6O|;t~lDl(os5KrQ zucqsJu+$Dfq{7s{ogHVROO!3?M1efoL%{fl&MdJ=p+ABk2}j9`3jBr{3)%81&`S^Gm})6M$sALP3vv)e%)gpe05MOE77 zev~tH#hQ(glG!=w5nAn#-kj7$TiYa}Smb6Fnr<#OK$)oGUM8;d4;FDT-$||9#KceSI<#)l`6<^vTJBdma0+wdLXULQYde1y4I_Vs`p6soQ6jeX2 zFi6!Z?K;2_JGdFL|tP%$IFER+(Kn-+3b(wLFVK^t(KWBj)i@US-8}ow`S}_|Jb)0~FSu<$dlG(;Xn&K-{-Xl5wW`eXNa)x;#@{BBe`h z$lB+TtdM~`&%e`ahrES0`;0+T!;0!&j5q!{ymRnWQivR`%U3@md zK3_gl1h0>g94BSnO2G^{C~BZ-z)+$keQr$~+p@-eo0{|YIC<7Y5AiQ&EAxk2QJ0|J zrlI#}RbRF)=gCqjgSjnC&ayCohG!^miv9WoG%T`SK^ck&YAk?h6RWbZN*O3R(@T_A zeEDP)Y_;vOrp0kOmdQnhHHEbRab$>AEKOmb(K@FL{kG2H?-6EHwt&&+M~r3 z!vy_0Ra4N`$p0`d1{!VzUNPBt%@fNGs5<-TDjNlO8B2t9CACoSB*ml@ zZ-I~EPyikM@YwEx+WJLBaF!5IUFBpj+IcFJWf8ZShhdej4%WS%DA@q#H=432ek(g< z%3X3w^pz&0oLztxmbiVOD-bB@Oxi%Rvn>g-&V~@PD?`*}_4lR-2*DCxCR|(s>mnUV zu>3A%tvtD?jOzLJvuIxv$KeBySjk9QPe;))w#+1GZbnFv-4s{es!r| zp^MqQsKnVYLhlzh)}o+L7TU&i6W7Ye9EG)>QEj;IhJ}f=86*wflkIvU8?;k#ME}{$ zEgDRjZtzb8)I?>9cvo&hz(IB?(5Urq^e=ZFz)bhn<#}na_vx!eoVtgRSqS*_0~B+t zN)}pW0#z8b&q`0Kjve#EKNIvJ9t|XnSu-RuD`eG9(lAqkpa-jLab2O!a*c&m&q5)v zouD*aMxy`vfeU*ckS1kxAtt}+qOaeKwdUrqJ6ku|FIw`{Y&TweQkq35*z0`=c5XgqvsbQjSE3_>5Ej{&sDc!}U^`=SP^^Qs@cl2%mnJsq$)d+cq4($f+eNiMs7WP5!|XUA)shu{W|bmW0laYVGTEVNT_o& z!%iT;Q`8Ed+$E7fDnX4G6ljuu0f_|Z@!+O(0OX1CXz;sco6Z#$T6JyJp|So^+g7Bu z$}GhGWl&!>YGRzyFQepK)F0m=jE{VQQ}cQ#s1ywJdY|d`h51s1pBLu@iov^ooLk#f z#OxE_*pUURHudgVK)GfX@Ewcd!oZGuS*GHUmK;D zI6s|`&UkB<2iCMiTcNx3jS2{E9dV|vdJG( z7K66}=I4Q(Nh&06De+_UTSJ#dp{XF)g63e!jVvUUGs61}A((F?V8OIircnvGbsO88 zw2yz!!M``=R7^1UfdML8SUhVYKX(%Ro_n&*OwU2&Kn|E^LKLPPE>M(Ri}akUdIf^F zBcxznQHaUg+!NeqHD@5&wdl!V64F};pr2;%?z{RKN6>)O703~qPTYaFX$T>2Aw(*1 zh3|k6F(YFcki8VY*1 z5)PUxL$)GW35_kfIf8Zn7Otfm^L&qWdyAu_cxz9rIl0D z=Mt03aVi&L5GC@tP$GZ6%0AQA&n*CZt~g@kihC;j1uN7tk zwJvx8e=Z(nmq9>oxrFUqWT83%tMwMXnr!bMB2|^?(6sdij<4dy>u9r;4G#gbE|GgThm*+u)T-;<-;mG zyWltX+>Y0MbuN#;LyyqFM$3L6(C-rUKnGrlxZUFKgEy!er}LEzX-|rhc0$_EzY!F* zA4kQ}a((LhH!ll+(iqoMGMga7x*tC6!Nk6U%FNYU*yS<88tGSiNB+w7p=5s3kwRs| z->nQS*T#f?J828>xn4ohY~9``A^@bNAVnQ|F$={A)uj^zJr|?8Q8!HvaThUEn8ypf^~+=XO;fBbS`d@v7GmeSG?iPQpIdXefIt!;gFn0zN|SB z!MZS+KrleoK9zG3ZkXe_H--fO8iK{(FnG1>j8bcF6*g?+?V~ehW?1`OBZiTVt3(o4 znSQb+Q}J!rxp{XZlaj_&Ja2(g?dsL0|LGND8nlXQT=2RkU(M|V5tt+8q-1P$%==|F zj)%4i9ip?BVIoH05SW02ad`!uzWih>OY7&Lv^gg}?Of!k9@(lYZ!?a!J1^#a!-V$Y zNg}q&R*46dN)6lxWRB6P7lu#5=NFDa1PZJ!NOG$<#DeMY{cJ5+fU;#$#lX%WtpB^G zCcn`nbJ}bUG-2>}E(CVeB;gnucOfmn3*Ev7FwH1);gMu+#Q#hPG^5(3Ci znw(qD4@4}5?sN`ZFJQ_WJf2-J)Rs_WYSkvsyIn~kGdzjtg0hb5W!CF7ewL~V!tM~g zQ0OA-+6Cs@5}g~sU%PJe-K#Q1eSAko4(czaE5>H`UEvH*emdiV&fy^si=yfn6t+|P z8ujXL9ih&oy*GMRS;m273_f9A%-Ep=j7eHtN65#jkqd!Q36+4^8Qu!lE{Dtp#!aD&4 z1YpfM+<40ZK?G0GH%6DyTG5i54oCp01$Ysw_6|3uz3?Pl`pY_~=5QS*^D9f9;S*pl z7JC+wgAI|L$^$2c_sx~$2-?`WS5AsEQGqb4bl%5MUBs;6E@rJJK3eksGc6V-ms^6fmldUY9`MmN)MP- zU@&sl%>pmXJ?=t{Ji;M^PkN=PO__YM)f%V}gY-!R7OcP;YO$9v2!c}VO5xZLp@AGb z^BauFo9$=+=L&YVs0Y}eAOt=Fnx80))a$y=^?M3m2W{V}G!oT?g$5Xd)$X6`ySwfQ z|A;8t61F_yG!J)!__XI-8!2F^-vATP!bd!4Uuuonh4oh;$h&M81D@v^n{?jzQwv>d zZ2D)Y6li7en~~TJ^5=yiKyw)1s2cj#Q_I~o&Ns$33(d{i`vR*6mC^!KDn5AN1yTfZ zYK92B{7z&$z;Cv^%fTP2RCT|uBrs`(1h{SmM!nc0<|S9jeK&U|Ul@iX11B+2NzF8g zAyZ4A1pA-=BF$(km}6P%4k{e^z}#|}bngYxGx%N-Q`-Qx5`h`u1Y4Oi`}aO!?-lA5 zdYO8gCX5>SdD-5|Sq#eyY6X(>c*rSDgZh5_9$DE+-NGnLErBSq)#hTVZf%J+qygm4 zJofT;Ba2+Pk+l+c4u1+g31!Kl{P>r6OGP#c036B+L+8+2PGRMoRqCF%ndLM$?1k)O z09s(ruzbEKWJ?JNuUuhGC*VAnw6s0!Wn7tOBfGH5?;l@j^-HZ`+djM&sUhgPo#XYn zCybor8S}iw6;}+0)XLmm)GA>)JxK{%ri}52i`rlhIQo@+M&tsid4`tzePRO9lzS+1 zb5_PaxZgjams{T=l(454VYzN-77)B2E67wnx7Tvo^?TVI`Mi~0M8PG6MIj_dqcbW0 zRv9tJVX7_w0b7pBz&-;R?zaP1j$^$K4S*vs|2FiN{+?)xx2yzEkh;&`fh$N)p!D0E z^Y)&B@>3zs8m_GbD2)>1WHn8bTrsY{XFDM8gOQ1d800KR6^l;U4UT|w{fi&nJsn+P z-NH)Foc&~2BjDxPvoR?f;Bz&k-P>>1_5%BC-P&3vl!iIE1+GvV%1DZHjOkVSZiAKE z`=9@9r7SILwGr{NySHp@Cr~VgdW+by_IMR-5y9WXWd*2>=n=^0tNC11Oc$BekLtK& z*!l@!D`HO!xWJ#G8{!Or1<6n&;5LCXNO@}h_?4mB6nCqpFYxy+Kt^C{IrKx*LNHyc zNvJ~B$9ZlT%FKMG>H@V{l8|8q6~<*v0h+BcYX7Y-J_|v&f<0Hgtl4LKmcS8r&r9JAptt1%kua?w zHRmOOvv6ToX9F1>yQc*Gm1}Xb0Hc6h$>b7%3AEw9LM$qY%j&=9R_qTvLg76TxGN$J zrzV=eKOST{9Pn{qQmX~DOv4blG$3pFL)hFr_D_okN8G~wv+ge#IQ>tR%nryYi@(z6 z{PNU4KDs|Zq?IeXxa=7H%Ggt3m!uUY{v0A-Am6Il5{y|!V}r*N?=ykJNGS@PrpgF{ z>5eH8s;5|TmJKF1xxP<6HwwBCEKr?e0n4MR#%ejHs)GM(!Cim2c*O$m58L{ND=e|K z_NWl~us{^rhri-XaRFD0j!IC00)^dWDbFngoKzaZ_G`Itt^lM-5p6{HNi5pxYts~v z!Jn+_nD|TeRaQc&_bl;1n6t5qaT+-h3TJclFA){nOF-Nta%B!`RzWJbeq+iB;suOB z3gIVnmI2x*BW8MB^UOZ;sfgqj$cms6V6!((%{^5Ky@&QoC1)B9x8I1aY@eGAo;kAJ zHJxvkVsk=$N6z?(W0^O8J zA5rn2BOsEESA=pD;oPA%X%fBWa6K`6zzg5G1b@qSf`nZ<6>4fbP!o5Yc)Pfz-7+Ew zUj@YlFT_Z$#YVFu#Q_%ley{%^Lc9y&P&;RMsl0!P9JcBG6Xkdi+#Eh{_b>HNxN5@qtSIMnq3KyTaJBxc~V-#66%cL6evx=-c~4zt81<^sNQU zb7((^BOAs6pk8`=b}~s47qNZ;@gz(xh%#_&zaJv z#iai}_{qqYQvs~`dqry3zky{Voo`uj!^D67Tg-~fMLu)Cx0Z9@kV>M&6p~0-rMM>N zZ@B?fOK@0*`^OZfxMd}&W&Zv^IsJAj2{$BNa7t6lg7?V+1-^2xaemB9-t_+1rG;!0 zL0E%mMMBqkd@fQ+o+0yorf!2PGW7+2)|E8R7mQ4PKLkvNA=B&Qt)Mg6XH{2bjzgmY z+!n7~(QOw1@D?_o_h3?H+9`Y+piu_rlSyL=TM1QO2+=AA70dp6i%TXzl(xQ=GxKp) zb$eudKYiO_rSaGDh!zuBJ}c@-(*syr0#!-9zG+Ye8v=Dcqiu zB|4VfE{*#wAVktN=!~W$>D%G2$(FZ7yX<1-S*MB2$CqV-g;xyZRPK{(f%g;p zSr>0@@39N?K-@N%sgt3CAZHQ+dp6mEq{lQS4p{6aC&r)>$jJ-T^Q~6 z{ZYkag@u&&A$Ihc066bifZH3Q_CAai{)#}dWbldbvHdK0DH21mnuRmNFh%Q6sJ&CgA!WOud zfy(8CQ>c3ubT6^^*usDQ->f<6tWp^5$%6Pv<$Y@i(&)R-YCEi{H=iLS;*9PEzZIFEwrm=!a18V!L+)#_kzpLu@Euu_$;a=?tVb-?+PNg$Y+)IQA_YC?L1BO#V zLbzOiPmQ+7+gT)418Mu^96Fs4D}aE1J0zCV+e?PoUQKh&vjEBn?Fa>~D`8MhdCLdC ziLXu#Wc1Voo!Y)V9n)BT>DC#;T3ok*N*iA=5ITydF{apt7KyPN0SlJ4S)OU92DZ~hD%V>zsuOzb!% z4!e!|7D=3uH|*!WLm~J3X3RiHJzMnv$*a>XO`X;_ODM0~W2BC^d~YaQtS^foOJCp? zun7vn&pv9VWWV5VQ8#`e%sf9=(FXr6)%1HQ|6RZxzT1q7>dr;d0MeC`Pn6{#quuSK zHh$l_Z+PfI^rvZ`Ug&Gk%y6k1Xy0rzUQL`hyfJ?0r2y<5Q6I6s6MnX;MYFe*-d(o2 zaEe{EPw8pyaIx|d952W{z5%OWkZ0i_5dw>WHkmIiqH>;A&2D%QJ<7_2>|wpNg-c_W z>M}B194yZ+&G;H06n=*%EIUvFU_WELU--(3iHW{$*jT3WbAPf~tV2Zo1|%oXrB-@( z8BJ*Ni#{=HnFh>&Lc|bPezYEOVA4CN6T*9vfnMq{#o1dEd3LEWxj$_lICTyc4I4E; z9wyUbszZt867VLeOChQs-fU!AoW&<`kur>|ou^QC0G%|prnB&8zq7T8wjpsAck_^E z4mu*qMkS0=CIsC!JX_kOfv8)!M)}1=X9k?KyxW`yTMsW~s9~{-7vK^mW=StjEtfX- zKmP?XR{CXG3CTsrgnL~;(tLZEUagXddw{L=@h_A<9po7jx17qamfh!qB>a4f47Ql; zS&~6C_%YI6%-_1E`K23Y15=oGPJfMZ%~|YOKVFF#DsgSWl%pWMwIq@VPI28K07ToP z_ClVHkHa8D3cuAR^47|7(P3OJ#(?`*`*ZsqfIv{ULdMT-X(A2E`xua&)*x+7ic_#$ z)_q=<2HoGn+a`P#sVubB$({gq^*EI5pWd=%jj9Ibe5RdcZ(D1gOOseebq^dsmYMjw zu7&P%C0z)X+e`ME$QkWE|D(a+hZ&B!J%ErYCK0#<5tFr68P=A3i)o)XG(LV0bW6AQ z_%>lwhB87XZ493+XIL5qZ8DM$DjJvS+(4&R>CeMlaKd6>u9#V8 z9Mcwcdw;@cVLxvzia)ki%f1@*A}EvDs%5aXeu8>YoMkhpZzVV+H~SCh1<~$+n}<%a z#>_J*Osw3xuDVSwazJ>k!4XLP48wmLaSN;sc7SmEFD~m!LXi97X-MQ{Dp;eR%@^)t@S;)1wjbvszKB$#J_(;r~(uLD4PBbNgYm;wfYxWbMKq7 zU--DXz^k^{Nyuth22LIys4HQUP$b@qTv@KlPd>_uumvK|M=6`Z5@ns0bKFQ;D zExM;M5O%Uf+y(0nd9uSj%q`t}DQT1tXX|UAv_P>^-IWfhqJ>7fzo;po+S zyM^TkVY0+5cBV-#@2*C)XU`8f(INn}I(X>PgSPhRwD~8Y`{rv~d(UK~>?_ij+7q*q;uskaw>}gmP(iJ(<>^*Pw$Gw@4@V!sPa! zu}>8<_gWkS2(e4JC$yPck653$xPMgV&2-Twb ztxzSHVVv6T)#-Y`VyyZ5L}36`hoq{AWB-LRu$85`N2VE6ctWs~+Zf>{d~GQEa-GSh z%z7It-F=(IL@PJGz^NtU)(v5}&U@cQuT@Gat|~R*cbWhEH@EmRhMr6w`^Nzbe}BUW zbW$ssTVV1F#dL0weEm3mWm4!X1i-SdAEP+<|7rSrV@ZRh2NT1+G6e`U00xi|DUw=x zz)~sg>%q1Mdpux!u&)VaLPjvCr$!YbMFi;pg`Q8Hn3yxCyMB-lkXcX?;YV1ty;HqE zLqK`HQEjx1Df4-r&9nEjnBM0WXfo*-(xA4dFC9^EI@rjnd#1c?2m;47pAi!Hd6g|% zzhV@DD6CnoHx1MRx9<(LDvw7B@l}0du^+ny%uw5fG+XS}dSIdv1E@kHJM* zbhLtYs%+9fKo6QBMX$nT&ej#q+fMIK5#4`v^e5|v1*^UV{l<=#-V|5FJhO*k0^^r^ zd3!!yp zU$#@YMcN<;tt^cn6EQq$+nNn5m$Ukx-!iyA?yLO&-(tnZZ|zY`)3tp zirA8obFu6sLmsCb)Pnh+o(@7UKI4L}+IHW@1$x`hpQ0@!kPF$KC&-?cHN9n0*}jAj zB|m;;f^o%rwB*)??qzbeZ#ku}ddm8VdOQ1<_n4-I{d;r1A_4rG0)hPJ|FOMv;Ry?rX{Y*1Bd$1L+we0W^jk;i@-V$MA~Zc^E&d5O#U%ju3`M&WEg1ot zehE>p4=Pto`ulj>KT)mUo={2IbS!R6~p&qFmni)WZvTxf8&RAMcuQJlC6|(5eUEq zIE83y&(hE@$YAZWbi{DAXXs}cH)4d+lGYS{BPE9j&iuM#S9bf10vSfpuJcErvYjaJ zXJwzM+sQ9_R0Bp_%Jn|(w|<@F`6t_U$qE2i=nG8|i{N-3CxAxcD1Kfl?pyo~^@5gD zp+9RRm5^FD9MZ&60gB!p&1Vs4Q%TOYEtCxeSQM5%J+(-`(uP%wFyw&!@CJ1;kOpoKU)>WXoF!>8gRZOU1~I4LFyo zN?{p~8z?CgljY;rQ5I=a->w+wM$*X7XeHj&;=mPgPvgv`i;$hvA_= zp{Vw0G^1+dg*iE!%w%T?lwA}f7s*bmRDAgC@K=k?5abaKjAye5il~!9IYN0p$tn*z z4*0%o#5Bmh0W!6_PKuV6Fr^gE+G0c|uO9wAU?!D~=MRTaPRgvoiw8F(aEk+<&FkNY zxag1-zJN4YF~l3f;5Ho~gAhTez`8`rqC7LFNpC1yi#no5Zk^vb} zQb(wTXT87A(_bLA6_6)P(q3q+vEL1CZD0v>@tOQsFPn zr8oWzxpcLQd-lK9slIx*AD|P!uR%J9>R{cQjG3#Hg`|+{y-X>n5=hdpmdGF8%Kl%7 zl$KYvPh}DOarex78w|USj)c+U1V_wv0{5u`Yfucj46*5b*1?TVz$5Dp?5c}43)jPf zZX|qbodnkJ1#`N^kr{xp%J--CeySW*GJwfi2}TKa;a!T`fPmdoxWZ@wLO*3@9oGJ+ z2Iu)kD(f2s2D)OIpC9!;8&B$IaibT1SLyu3YlJi%OyS~zxJF(tEw6tU34jI{0kmvlEAGdv4+FeXi`|38yuYF{@D^RpWnU1?FUQsauV88W-b4# z{W>Dc0*Ce7t|C~*k@tJCUBxyJMvAdC+d48DfWKcwgDU)rixcPnT z2+K;Gbx^#!&t41if_2v(-lieIRy+w(EQ(oRZ?OH-Yeb0b>CZVrL50UhXIG^NNaUAQ zc|b07#GIc9MXWA-4a<3o=O=pzpg7%LJb7)b+C!B$GPqfd(>eB+OemIa6O8>=tXq|3 zXoch{(V|2Qmg#3(M|+$gyHG;J9j@OP!Aw$8@S=r8rbF~*DT^$(ANbSs3MxS0t%Bxm zGS~z{_l6`xHwQT|u-O{&5We@oIZrA;WGyJ<-nC2NT0-oaUp|{`bUF0d? z90lawdYG3CBv@jg3y=`FeD|Z@B7u7v0e)=SdmkfQRihhREAdmg8$*;X-0WYw^P)%^ z0O;Io=LT44mKoB!r5b4HVrsDvQ^BfJ$8OZNSx7o$GaRrYa&g$R)zqeFL?0Bj;Y?@# zO%Jv1Y$>h=JAAWAM0$c#ai_pX4ZcWeKf#)V&DgQAV^-GrE!IQq2>*2u)Yjl)xLjqk z+*H=$DZwx4wtjX0(KLBLs@@*wVBsP!f@H@l5@j@lc86YFti_Ev@8a4+qX@WgURO5L zDF;((qba0)gy!MOWPHrvK?F*M0UA9MA+@h7*!?9aewFQ=155BCj6iNL_Vu~e1Z$;1 zNw}~V6MG{>A2Ug+!YbU=^sO$Sw#00CuwoOrV~oNd3;A-shr0!W@NT?DaW_?sQw!|e zLah=m2ya`d$U5Fi*e5j2ZhTplCAJV;cvH|4)|I~Z9{s33Lv5ZVo~T$CIR~EmaY`l@ z_}6v?_wf8F`m^h{T5rRIl^mJisqR9)ElsRBH$p!zYe;T?4Qc#1-1kNL7Z;B$@)~mhQ8WatRwIHDaLaitWRzwgBBKy2&uM< zL8y9iigc?mdt8C1;qM%(=E_>&FYPJZ;I2$J zl&}ud_e#R(?i-lv>v-`JdR&n#N`_wxGtSrkTm z+v<2e1DOmsB@JE98F9*S;Rl!#CPDLC0xeGaXPoy{SX00cw%lHsl@Yn>uVeQhpiN z&;%^WNe}nm2y~>8t*tPnHH=!O>J?-#0X1fu326rHIkaN5gI`|l@A;9h_Yg7J`BngG z6-3IS{X?|J*hEE}GLfpM)9-^@D}cZ8+WGx=vBlhWF`K_v)|P?FvW68V;g(*B;?VN` zy{K@{-!uB0Di@*`I_q;&q_;vx_(#2N)38wYlM%wC?ulIiWgt@_z0a+RzSXO@wL}OL zVCvhyrYgz00DZoV+FBnuwCu3Vgz|s>cPIR@pU842^d5)Q;I~)Z_g4YnNBGm7_Fz2{c~8!>)?*?{5#=#4n~Mdk~`?X77O ziHy##stjTG=Ve)g{+Zr?V!+UhH_roWy2@MSH1cH3cOko$}Rr3~0;yJFJ)FL#d7 zV;H9NSH05xy#0zGZ|xac;6519a2Zv2e+LrXame^Aw=p2oTN{Q@dNB1%uRl%%PVu)d z0}2S=5=Z5IyxLIU?J|>8Zb2Awd-=R4nMZp+@7V-JY+-BCvc9#SJs0DoLKx?s1hGiI->nD})i815*x;7UL&dw&M( zJS1)twy-R%i|=7JlE$GQj$-Cl>}@k347Rak07Q#^RT>aTm~wBuHIMbPuK`qGC4!J8 zL)Z32B>K1V5iKG!PGipR&j4nM+mkZ{dk11HRvgq??WtfTDpqzD{EL-=@zkEK3{?8Wum~J`_H{lxB>~t?`Kl=qGX0sams~ofwBen^h#l1)&f66 z*9wS$jq6I?f-BB{ZIHV`rBWmBPPB4)!Qp)g?Ss@N5vXiZFwtbayJhYH@(b0XE5B-Q z#b?lUV?&iNpailOM@Q@eklp=yC#+?Saf+_D(wJhscYqRd5ZV2gJo4&8 zl2o&C227}566((tr3lU@c*A9wMU`vY^#iWC)<^p+$W6rv812N?#^B-r7jzD;Mx=w< zy=)VB!dPsfILg;&Ci=NJ%9swfE5dE{_s9GUY2?uTJI*=M^uEMxsG@Mg*7R#Z+bY%w z^Zupv0O{XNUdyBwEC$+Ne#J?7x&6d+2fIA-ClMI-b`{(z;Z9+0#ZYHeU-!l;db^uh zC;NcxgnocvK~#3jU?|KZcV|gFfLfc`?XDQw4M=7&8gutz0ej%`sRNUeL0V%ABA*kq zCG*S*#{CPkP=n@&ype3YTCnXgzbDA}TvMv4R ze_se(%wZYZ3ozV*X>B1Ep(Jqa8SOLu>ernQ_@A*sqbFMu*3wU5t(1BRQdhNLZ4KuG zC&sb1EII*+QPjEGD#!A3UGhj&vTcOIqK3*U>f!t>QiZ<|0T?a&E5Lk~EBL2~pjztf zgrU_F?eo`&B`jq(7SinLH*Kvz-q!qnAR03zD?jl`gqhD;bJC-fyfig#x915HXbKKx zC0j0rhWwf3K9^)*M#3I1nA#4pRtUHpZD9|ltwgby(AZ}>nzA<9zds5blv30F{*B1D z^|s7pj(KvypjH~-d_SA5XpU>z2>>wtvchdF&~HHu5_;PyL%+Ppiw?lFZ^1Y&Tbmx# z(2Ydc&#Si9$J>(%dSm_T*#%5+q#>;p8BIgEPjXKPX2F_T)wgvOu(lWZMRS0Z;|JSu z_Lu=#6Vy<99B6BTG>DqIok9OAx0u9?dZAm}4(lFlFa8U{T;xynHF%MQmqHa> zfmQpuM|uKsiWZ8YvX*;{1;#m=INY)0$CVn4gf{LD!927*6k6BT3a`5g}DuigE()@PwVOGlXtap3An zwDsnx$FbiSPBl;ohNLL`-o|8^)lh${%ImeUa2h_2-u|usS0bD!WT=4Xc@QEtuQ*eD zo14Y#$p=$haf~FB_-(Qr{x3fQ2QZ#-SA&*fecXn}m zD@qE{q^}5@DzLqe*#d^8oyfS$H(Hbyg11G+K-PIl=5A)+CMTGLpCrpJ`gzLbQLK3Xyx4_8;MvodC1^Xz4|?UI zkh~l05M{$}5A@^!^zU6=&z7-o<6}xU*(@BE6kCNGmnRkEy{p>5HfF_!_;m?k@p6FA zdI+w(zq7QdEc8ZU-kA-JP$`rEGz>F>9|AN2vS(f*y?Twlk966>RoH5V>| zcu<%c>Q4vwm;G0<6ehd;(=3WUi*-TX^!PW@HPR02(tWKnplxN(c<#2z+1&-F8S*lb3%tWD?S@WpFx zjuG|dH;OQvmG|%NlQ+fI1=QE;$v2h2UIJkm;BIDcwBXh<*iO7TN+y0PdOiC7S#?~> z1YlA#Cg!(}mFzc~rH`-?vO*;IIo!6$|NQR*aoW1ipzN9=m+YIWzeE4}y@cLcMU#Hd zu-A`G?#UG-;AdwO`}51u-nMf_wczwWCBFr7f7IkHvzMSoTLG+F!2LwHC-W@!**ycc z0k;FhIY_{oE%xl^eZJ}Yq++dVw9>jArSU`jMtFSdmnEY>{y1elCI@?pY8Z zQv>TCKc*(Fb4sSCy0Vt5Nm00+l40>jnXKQr%6JXYUvW%Gm zrj~T?mOJrhW$4TFhAON!ZMt!pC&X=YaVAU!9CVmknpy(l1<}A;b<_SWX4jV8ddqE# zHiZb0DQ~Sr!02yqT$>;GBlpOYhi@$#D{M_;`fan4a76+D@iyb14GI&fg@9G!gTyFW z3o#fHAc}a&AU{3+gUPj`>E?pWJq$+xlp3HmEWYZc;0WOmeJiWKx#%lYfbD>n1uZO? z)*1iWR)xeznP1+t6aSY8T``_FTxlq+gsaN;j}hF>#<@x8>GDY_!&`RIc`On)8Vg2= z3BxgLG&>BocQ3dD@K;zGj99wB`jU9pPX0>$#2}o5t;cG!vsouGHg)FaH?gXADe(I2 z=hh`G4@a5ENRnBcnGs4aO5750>C5iIbVrRHW=xZPd~*C$3t5fQQ4B|SbFx8iu&fnT z(y~;5`cTU-$n07NC2)49eS8tO|N8%X)Un-4v@%XNm1h$wi``+>*6uFyS_}rNVBWEV zpj|7V|GHTNWBv_GD_(VhZzYWX7)!WV+$ z&jOnTOO3yTfzj@f<6$-b`Y~yYx6QCXchv{0gqooY^&y>+_b$^n_7yblNqP$5*IBF86SA}1k^ zZL&L$$R%#V9YMZ}30d>@DAJORCPQ2>j@tV#i^c*!`$0=b{> zOfOR{l>0ybkL#+*CotWvHG?t?ioA~l28>|rv(Qvr_LJRafQaByZ(Bh~800O@apbh+ zyELQS1pRL`dR-U05 zfi;*O_4;i58*_s#)K#ra!{=7$+Q+(W`vlmtV3c2L5a^Vy;I28d)cO0ff%-mr%XEdw=hcq^a%)LhTq6InO)#pirUK&o$INkl z9YYQkq%+j^kYJ#C`x`0(_4Y!2YolOG0Z6!qeDw@|G>mXXpzjl8J}*JKXFr?g_8iAQ z74m!GAsYEyK>d<~25w=+_AO@F|Gx1pIA`SFxrsBVG}dN;4z@I(SNc0A*uLf1sV%txoch1Xiy`nVZltugAr2?K?>TA^MV= zzZzU11@rp4S0k1&r)YQ|C5z{kw|?-@B4(;5tV2I20+RK6&ACK>B4Jr>ny)+ci+|^r z9Ax`q@Ygif0B(~X>7(y!#KcTxS)hU=a*emh5aJFxw7Rv4fNm9M-I!{$Lb`nS47$x7*zitl#<4%0EwF(bX^)6%=~*2ZiWkOG{E#7S z<}dcYmWI+BDh%X>#VXfeywei_Rakd@1*0t7B7xNyTI!dSYKf~U7ln7TGX0`GTL?N@ z9s}muf!?X9a}HF&A*-SY75wS)H@fs5+|sHNJYiBrdwnKOqnH3h4`mrQ%Gt{@kBRV= z@ino3Wme%^2hbUpLW+w`VUitUa>bZ(Zz0*&6txcX6!fz0SDP2hs)eouSQ7W*{gB>o z9>77G931e|v58!42jxTL`|={oVH(K@7%1;4brOgwJOt<7Yka-j)M~fVsDs}^>;L?( zTJ~G*#Xk3}Q9q2LU%V)Lq7lgrX)WP%{WlCVr(d)KY3a%q;?A$AwU5nOa!v@=-)HzO zYH`-Z!u(9e?M1Iv_8FO=dnza_0Rsd;x3i!3tW&i{QsQ#Ug7TiIv!D%=3~=5`pt{d; zsJi=15xAhZKVb1ISHb)s`nSWL9+pwB;6lJD7e@$y8uT(~EM!=IOs^Qw^{S#Je!tQj zgTC(zC1LV6HptL@}mk+@JzO)p5Iu0q&^004=!H7AB_0m)g{Mvb>%#i;T| zO5-{rKVOUJkIEDhY;6&idpA=4{nK+C3OV1lBg*C(f}7rgw>UhPGJiJ(qZ<)c-1W_w zgJrEN+Fp<$vaZUCRz|8pVQa;ta0!$d&WB${3~K4t=VMcALFSZuiJxhAU?FfjWrn=^ z-El#^Mf`XubmiLSEmH$!zkh$1ob=DQjp{7{Lm%!|q^b9_&{9m0MLBIS8m70j3f>1u zO*+nxwA4uq@K$O64QRx4X#*kwNFzBOsa{!x`#=91?vs?W6(K|@ zgYQWq5V}y9C}~ES(v>dvZ;G~(wB;2FF^K3&oE-Q3tf`Qn!J;gN<1+;x4j{zmif|NI z8l_ANu4;&H@csl(wxbO!Ag3_nFrSB2ckk^pS;HrxAfM2=9aeLdBA%Mu2nB_Y-J9(M&(BYhY@7Ozr!m5hf0Lsc#-(Y+Gp}u z=Cm$Omc)B_)w|zoae7OEQ4=t=;^rBj)a@)7&e1Dw0ZES#k)e&8wI2*~kS(l2+RtJ- z^_E4=u}O#2PGQ?AGCclbWEPS7#oRVv5f&6Bd*IQaGP>5Xb-`q}XhLSFSzwTnQ{OXHi+yq% z;$I`EfH4lRy}{^UUDEg5N(8{dmZnStWn?>BQ6OE?6`y;?a#@ zob6DWMv)$h94bPZyK#qh97%=w-b6C9kchtPu-ua^CJC;PY=B3kIb`E&f{mXFq|N)_ zRz*kYHBhJ^2y+k2s2b7RE`y+lk zj0(@+Rbk2;D9y#ci7H_G=|&C@R=D=W;F{XMSl%k@&ZhVH2BiBWv?pN&&92*^pk z-9S9OFV;fwFSyu!JQ-G_;%M(v)tDsPo9IUQ)YJj&$zW;s z#SO-Ivn)z<4SRizS;;asPuvoatD~WjKqeHic{xO-62oNne3*Db0zZR|18Kno18J{P zDglJ^Q`|mAHoAXR5lM;UHI;0cvv_91H=?6^||HUIu?_mYML$@<%7TZ z)lv~fR{3Em#9cih7)??k?qp3V#CE9_5;Pdof!DTOtRnD6$thsP>bsrnY3()C{yd6V zLWUqNC*U2($Cliuzp!xxN9ISKhyoEu7aP#gV|sItRFpYxJYe)~P5@xn!l@#9)gTJ1+i)6gj1_f#ts_%N@f}k6$MIS=*$D~q+ z93ggLt3nM`het!IXN#u))fn|fyqm^`oNVBFHsxq49iZB5y+K@s$?vf5(IxZfN_P{( zvA0BE504$3=QscfdeOlS)}@75_}k)%0f;e<-n|HOaB;y5b4j6UDNY{aTbs)!4N6J4 zFlnwOZW)aJiE;;_Wf@hQxT)W7jfga@9ngSvOWP!08vd(50}?!BPN!|m3MLScFp)gk z9IFW9F87}P><~|>ja%VY2?PHn(Q3<(zCo{F;)ybQ$iB*tU)`wiq?kR#xE`Z0Yu5QU(OEY1{#6=gSG`t&Pk#Rj20Q z(x|Qbm(AmN6`T0uN#`H50(znib$H2H1)J89xiX~3nWgfpsXMeHReQVRr;w7c@1pQe zi!E696prWMAwGj}Pw=ce;Jk{^aJ|?9;j&AH7UM@1aeVcG0CU%Rj+_5gGjul1-~M9V zK&MenY`SqF;(!$3xJG|#QBZCmAi*l>zhz@th>=3RsS^}WsmRINtl}tDd%OPUW^FUe z4VN&OSNuOi0t_ou2LD5#0tK@%R5*N<<_%4uHkr|%D;GJk zb0|vOo(HNxUsW(g>R_gEB~tR~d{+q@y%{oEE-`js_vW}Jg(m%bnqr;Z!IJBJXlj@4 zxPYqm3*u{3t-Xm1)Q*UU=Tn5kW+9v>6#|1`>8brA_speR4pF;M-bFQjiulKJR9n@l zgZ{#x2?$@E1Ib+G-#1wi>lSKWte^sjB6?~@gd}?TMdgpLBp@CGC1>;O;tONJf-ok8 zdIxJO%K6FVPmPBWtK(o*WDPDBe;w|7F2n<(=>l!g7vft>caYgP6*jxFszO1b4@AQq z-JP5~$lf*fW&wfP2WT%<1jN0i{aPa6SfPVaQ8IB3^*p8&?=pRmald;`3xb;ok|;$& ztNjd_$|9r-ublwTLKU#NuX$MEzCc(Y;vRUfmPR^QMe4r{1cIe=JuuV2n$ITDf$v}f z;z@H4LXj=q{q8Xg`p}uSS&|Bb@z(-Z!SZ{xIBn>W(%(MCU1@qRsA?5 zwixZSPqILkt;iW(V&sO+s>O;zySoetZ<1-|jjK4s$xqd1#*;|>C-fP-vJl}T)p0bq$f zOLlKsLv&b$H87Jth-_rqF}&Q0%nDg|4CD{W+Cc&|M3AJMdv2)q1A_6f~b+xOigLQ_HZd0sE{L>0M zw2v!%bRjJKd49t6w0W{^|fBqN2Z6Vy`yk`W~g|y}TGpU35Oxl*oA4=5Ur&hM8 zNm+DOJ)rK{zMVa{w?J&UzMtvmL_<(_FHE?@=7gwH@Q^ez2j5(uGzpV-A0xgxVgU`An0~E!5;U7G{qsb($qi}L8q{+i$V|_FM9S_EeO>7Na|E+`hC9O z^Yd`NtaD@hAO8=S5`swJRR~wMO!6L+K;3%rS@CTRgR@&)?ekXjc4Q){xUv$NV2LQ_ z%Gl3vo5&&+Ejy8sWxtIj811si}RU&D{^gZ zI-~ym-R*Vw{ISm1at|n7g$(uMh|gl5_dAgRgf)d*2+E*I#$j*D%rqFpPe%LPQZ_{W zJSN)YK9|nno=c5>U8T=;o6Bkcmq@aA8G|_{n-jqp)Cd3M#91}oGl%r26Z-q3W;I?K zUX{FJ>SR~avvRFtCh6O_^WE7?Ti|ALR4{x9ZooB&OyEoN*C{fN@!hUwZYmSEXuL&hhMIKD@QZWRP6gO z3B)I-hSz!e=Ii8|tlP$-@9@)X0LD18NIqpJ>v|N_UU&zKDr5y^k8`;k{=bA#rYp;s zQ0K-}4qP=8HE&Vz_IENQv=!~9&L=fHph=(`vs+74b=RHX%0(y1^>PWNAe5J0UF$6! z*my=O=o&hs?oFv3(pTf9gMmYc(kAw!^?M2x-eCe(0Avew=OB1q0cynuT3CmGM*Des zvtJ?4Q=4x!7&)N__47al`2n`FsALxeHh{ZS#VY{q zpau-8qbNJMOQcp#burR@3fEn-oF^pJ*ggwHwRB%OmH zlb`i}3q4( zOVrO=!td_Z)NQ>bpP#S?&47kXZmsyc;jK|^VapL}0J+{q-Kbad6L~(EN=lPP8WV{?|=a=o>OlMV)F>*ra5l>9@kF<3$PKarV2>0ZHqFCjW1q7d^LB8>Q3`mi|M z&&|KGyH`{93I6lHxo<%>`IAn5g2gtcq#IV|v$vsd9V6^@-G)(f-l9=!X>+{#wTO@@ zFVFY+R;>(@3G1y`_TSL1K<^<|CfZkh+cYcFD>-1?$)1EX=zh<8YBzYU^1O=f<7LN5 z2YIfAxCBRtDJHm`(iP*hKYoi_6A}4gmovwc4i`VI%);!ue_a;iXY%`XLJd<2eBN`l z&sr<__)5{%(iM<2_jr2Fm8`d^4T%fo_a$Mji1(^igWp7(5<3(B>sP11(?k=Rq_rf|;+lTQ${rc5A_h4PjV$KbgDVRVQ^ z3YbG%t`!2-h811mlBo1B)CT3>uu56X?=W#vjsR;(7Z#mgTgWe|l5`k_(*wy>SIeBw z<659H?T$6YwR!}sLJLL(H4Ti*#nqNc0XLQO(8sCkvtgw)%7}&id`NB?XOm-%vY?-g zh`L5^dp}0>O7-F=^QMioxqV6S{Y)X1HH4ZTB6y}2+Zy96q~UX8mR|T-R{|;~YUIzn z?-;26{IAfSgbPg}1PE)sU-bS@dG)K7dpdizM$A?cQH~Xq84W;SjDx$L zG^t6UW(&E0T)0hM7Mu=DLQ8Bd6xVO7**~9v3Vg1jNt?QHQ4tnUNDa4ipKqreKw-4xiYBhq{Y+4Lvu>i|R*rK;XL*YKK`5@U z>V<(>y;v*AqO7V=mb)5)n9Z;CFglwk&kpVx;e;#!UTSOE;P*dk-A~nK zG(!{{_{I7eHFt!gMZkyjP(M0Um+li73uu?1_NCTnvsjb55h>m&+dsTiK%9YTo$*wj+e({vdRfpS+d$Z;%S4fa;_AY!m-2hbLd zNQ`-&kQpE(HlA}Bg=sIaW#8=+n`3eJ%@{E@D)F}d zT1Ko`2ztG~*4;B5{yBbX{fY}OWSv<}xV0?7o4PE2kWzSDR5|dzLC$3@o*kfB1ka9b ze7WqWPSv(&@9^HH`{%UO{Xj}4?aCh_5f5R^QF6KT7a#*Oj22#3v_Sc1&ZQw7rpYZR zZWfo-ee_TAKx0*)m$=S4!hO@ja{m{SdHH10)*KL_v*O3x!hd*xA|B~dGp)~iX`#S1oBPF z?W*!F*M;OWdEWs1p%L!h<7={ZPspR?73i#2A*T)p&TS@1O*Wj})T{`@+G;fS&w(Qz zQRA&?0YeGF?{NG&(4-FC-eNCFE7;0}_bsLg1K+X44B1uTcqJZv9RN7|#fDX7V^X;L zckst$CooVCqk4|GraxssAg8LJx!w8N_pJUh{lTaH?SU>$zU4{}7 z!g-u($k==NOiUf4aumdmh<<^QH8*ojUdpI$I&qM#1vGw1&X_MbFLZ4!L5TCZ>ZV9C zL~Y+}zv0bFTm2Tbm$m9z3g)Mltve}eV%}^oFW@r;Fu)r$Zo@_1GapyalJdO)LZDa- z!_wt~pFbsPJovf=K?DIMAZLx`Qp;+8xCdOAUi&!u-3G7|L0|FW7Ust6%;6^|fAC;C z+7?oujcv5o<}?tr1U(-Y*YX9S6ScQVWr+Yz7(&W;Po?RyFc^nl%TrL+{Tuqi`vS(| zKJqD1`ASSb0m+?xaC}Xp_*}zGJ|N&wi;F4dbwkKPT);n~>*0&xlG8>ibqw!v_=#5K zEh7uT!?g4(f4#$qS~5>od&Fy2V`|XSJvsZ|H<#nYGKt8L2Nsc{CU^2?usBY=eit#`vy-@ddy2U@N8Xy@L65V@8#6tn@>2*AO2^&#sPy$LqX z{@tyTn}3-gI#`4@$$y{t4D|}n(tkCn;fx|zEoKHjaCHFP$^ctHq`v|yY@$UJUdR_Z zyN^5#ZMk*19XK#%%^-yHAo}iaE?z{ubDSsyL*@?!qq^RKO0J4ds1peUkt!Jvyn~_b zvkl+R2S3$=JyRh$4Fxffu(<5fK8k3nc1EGyV5o&xiZT)0Z}F@ZC;SjOV0*oPThg$j z)HVoU&CX>Et#Zu_KI~J6LeSP`!Fx&3R?*4^1|tNz*~k9SNf&Wc1)G;jlrs34wE4RF z$`&%E6}23RW5aNOgFrjjZ;_U_8YpBWsU1u_tGxbriuj&Vys=V`_ zNwhooTS0V}4g3Kt_+0TqsR@b|r_rWQI>^P8g-2PR5rQgA=aD;br}rD^q`)sKZ?Svp z6ur=B)^6lsrv%5eGX-`k0hYiU&cEp}|M|aZs>}*DR*yoQ7qxz}XG23U!I;8GD=8Z_VIEw>Zx6D(ceREqi|3wImBkYRm~!X%tiV}xhJJ>1X? z0p?e73E$(Kbw|VJty`AX`sc}gzT9J`@Z*7obAnAz$)+>_Q(SRDY}5lyFVM^aAl-3v z8~pwy_6g=_^Lsy>u6pT;+geSL<_b`Q@aR$M>7HlX`giX&EyNCqwoxpi>QtzUIzagL)6pXL5;+w(UfVVbvjADHiFRp0We%#}=3?pcP^`?;vCf6Yq4bXc!k+*)Xl!@{_* z&{CSH|9yVkrjh;JXM`=h{c}R%=Pl$XQEo4q%LP#}`jXs2xI_nQ0d!fn%reO!KzsaI z$e_syPF5!OSfOjrMNM0dD`gG2XIzL9F+kl2RbQnhy*!tp&n0Ta{4Ti^u>d0j(L7VD zxJ_i9ThOId(*TiQ%7FQ=*b2?pf_m)6epuVst;`jYK+eyas*sI6W;rN`uROB`fCwDu zPJ)EK9NxPhvt1C{#ogR8XD|J|E8(p#d#(-zKgDEW3D+oUg1}z~bfM^ixQy?I?39>S z`tp{wsaC)xdsz>enuX~MTxT)eKFEgURPV|Hay>m64$mj1X@*|P@MS4BWUvxR;$Al)px&0dh?KkIY+r_e( zJ6=@pLOSWg)(fzd>SKi%xFUI<%t&1N#Zt;aijOplHsbR4L%eh%!~#{zNRwRvty?UY z@%}S6qU6UiKr~B=_Ag94rV(<*=E_?M9J<|g3LXyQF7V|0bRXv8@kNCFo3#*l-FFC$ zZJgCFkVp}noSjXP^DVgbiVZ43o?Qia&tEDB`xiL28^90V+?P#N`+jopW1LBLl}( z!4v#gx_327O#kP9DS~mLpmr@E?6WSK@7Y1$VGi#7(czUoCs&x;nS_ zhrgy}#_hOZax0q8YW;Hr7`=UuEq`u`5e$Z$;j?hzjQe{KE#2r1P=B3z?-xNpame#k zWuY+c39Bx*vFrtTJNU=kH4GqZY4-q$ri3hmkRf4c^t__J6@UKDP}D{SD%g_2t)=r! zZ4UlTKP2Iluyox67$8Uh4$d3l>FH~8*z4b zrn@@-s#;-%X3DxCYdL37&d&+}qCKW!TE7u@272}k^Hb2>_}OQPdylv0WX=$Y5cKzX z)m8xNLR&%ILe6V6QkQK87GXvU^i+PwV=Ay2h;v>sp%^U~w-vd*$K>@F7}Pok_iy{9 zg1QQD{}zbZ9u|K=52X6- z1GhCWd&9mJOxHzOSokDaG#N}<8wG$XEI8q>UcOtaib(@G-^YI5!x!hCiI%;z+2K&= z@8|hLmJAnYpLIU_i$p%aiZ78>a;j!gLRxTjWy2}H_Ec}*fjkL?IA+V_ZRRHip z!M2g(C}rb&+6roYlG`Ss`~CbfDKTnUwh+uS?P+cesUWUY{JbdydlGTZ%YIpP;paUI zxmJYh7Oa>!xCP!c%z8^kXO(7 zUoZy?)s6f?M9z9LMvc;v}LV%Cahlry$gl{=I6B) zFM*zEy3KHP{)&sDUdqsvUOlfk&-+h!Z*ZBqXH-850V;y$S>4C~Y|))MS4`QZw_s|( zgI88>u=UD;K{k#5np%4iI?P0R@b5&qg zakD7@Ost-S>~HO7p-peOhFIym0P`!VImx1}dV%89e7{8HiuU1q<^)`*)`TwDFbM2p zhtYNtF`Ujm%lR!B7e`Yy<%CjxrC2YhG}h(HcIL_ad9~;EV$S|=|KAX%W0(Y~Wwb=S zC6l4!3hfVG_NL&%paLyKr!Qo>!XWI_CGfj#3Peb!F4{E7Xl0yrB?P9x{uTlP@Uk(_ z=st;%6@Qam>k4TLQ~$NI37Rl@P1L*WVZ{Vf@cz!P@s%V4NSbTjxEm3%FcvNXlxliz+3WI5Si^gF4ft!7H~kcpVj&-k^}w(L681myU3e za%Xw~InzK@L==nxQPTflZ&tmW)&_zoG9lZ|-|5v!@DSoI11`-%WX@luvTFGJc^`)B zNeudT(w(lgS-@APyYRz|ta2NCYHfgWwg>G{PVA$jpN@KN1xu8wPVT1(Z6QSMUeAchd!;Q zGTMT83oIJHt$T}-vKBdd3?jOBsh58-!T)zTNgS3UV*xL{KS4hsN{O1^rGVPfri%t^ z<<_HGHI;c@OGQ02y7bVberTI7Pn$6sSoHpV|zFDnOMybKyXhjoy@xv3#$- zTJG#GA5})isV^=IEIjqGz`YBfH1OHq@Ywg88+bn$HFzeBOJZy5+fmCcz%W4U}dqmbVac z$qe^&%H`^4upsj};%;u2>|{9syxcV~0s$k`J_!=kI*5H8uob1ImTx81xYfu;La)l= z(Vbm7t8RbA#ZPM{-_0@?pQh-o^B@IZU|N3bGUGl7(Qn79MQ7*v`rk-sHP9HvH0(;Z1y=R_&2ovQ z2zVVli;GQ+g#q;@KLIaGhaY!vtordjif&8j>6Z zQ3&_1?S6?ed0A@}HHd#XHb&7tuZZ-UbA?3_b2yTbRKmLYmz53rtKLFjsIOcxSEOtG zgf^qlGWzpLNNax3>4&BaJj-Q5nk*Bm`3!1msbEReza;O$N`s317)X>F{8%PK3<~qu zB>F8dK&*aM_ZcozbaKAk>-L=0N`;$wU)L0T7Kr9pJ}+xUr9g*6hFSrO8WcyrU@Jhn;z-|P zd7qJs%PEak_38vy#e43@1k(0ci{979A^9hLSjM8nFFWpiRJX)U-+IMp_%hOi_*H@NB7(UFZkTr za$h1qVFA)f)trH4!K^q)MO+$aZ*M!&tt`vdMN z(D+UKH&-^t%U`O)=GOl54ufUJl-5SM7ThQ zxN$g()1|N3wJDq2zXC%VRQ#l28_z;3-tSHwWQpX1N5w1~ok79@9*)=3?{BLl{-47X z^@7qzR$T`=fVC(+%TfVcvzKaO2}>0Nkk@(uEOY)_(|-#}O!EJ62`=hV4r(aC7UoGN zAH|z?104IAB20PkW@zTgi{%nCC{Xpa!o3A~QBp;{=^zE$$EaQf_7jk}oZ-8P#OI@; zV-&c``~F*3SQvk`h{i)8%uwxMNFsPVJK($=9xuYUmeVS-gxUd%VBBr41iUnU`qbsr zBp7fPc(1=jp(U-kJ;D19oc$2lCb6(LpZ%Nz#tE>GYr+8Lr;vVn`Pm;He6(H5jR!gM z_F7p*J9{y_SgJQKftuyyPAvZ#nG~I0*-J;+`K6TLpzb7S!@FZJldi7S8)hxK*%oC+ z%M^uuo%Gu1zk{JqO9gD4{30=lv*m&_YVk%(uzX?ymm+ypYdU3_RbIs?H6SdrkONU$ zYHsn+r@)I%HpDuo6yiMfbGS|#s44Ph9qG^x>g_1N1+6l;;U(6=kv9It2e~eu0nPy& z_L>Ficu|}AR;sYRGt&Y5be8`QL{Y=`dGG5Radrs-lQ5hd9UefnUpnKKoWe?^@BMfG z2vW_da8_$y1zg+w8Hr(l72hy<(cA^M6Z>Yx#Xh&3C{NAD#$#cAXrIb=I?MMuy&s%n z^kd1MO7N@u%H?}0l#cH$5LlBnznQW5&;LrKf6{KP=q=Z%Ou<_GDkfq|#xDV>Z<+G5 zNv-r2xBg`X3V^jDXdrv!8iua7WB!Orfan%zVOpy#xZke zQ2X=H`@B*zRamCHKl1&_{paYtY3tUx36W4Z-C6JW1qe#R22WFBlb?f6KDV|K_RE|_ZJ-qc0E%TuM#$T)m8rq8bBv!|0xVI> z`@9da+Q(*=_0x!5A?ugLJGm0+mUEH9$=-v zQt|t|)$eEAzFk*n&T<_2rsn7O9AvVfq(9tUWviBwPNT1Kv_$^Mg9}-6|=|ui5J@R5)6K% zlzi*#RKx_S{k+9{hM^%S=`W#Z&jjKgWBf6RRYDZV@0q>@v+P1@%Y%wW2;I|H0R#|E zCw6NIFktHK`W9T#mi@yt@OjZ#VUJA%s&JT)!+7d^@cD6zM zJWDY9^Mu>e9MzUV4xt5+lk}}AuTOOQ3bK%YbgT7iU}XX$tpOtM32X6_y~NF;ms*ib zYPCOC*V{}jopXxW|XDEAmmjUrq6&_^`xjJ%_(fTV$~IiyiHye zw^y++3It$v%+;k&eLKXmZ^}$%^<4LDYfdsJznxtnmXx*6Ks1+75XQ%Lo5ZizAZ^!F z{<}H#&VoR1AW{X8bPbv{6~wnOgY4$#aoH9C50Q4M@2FrO`AIV8c765US`~Lk;4oMq z4ZoejY?DE@d4yXF6yWSBBEmXbd5rp7q~nlpRg@i2aB?5}zGOe>CV>3ucM((hof!>T-5eYp*#Bxpy8UIkilJo4mMla4{MN+@7)}QGrg@Y+g!4f=ut{m_kCj zs>l=K4e!El%)De39mg~aEf$`=WA98I7B8QXi6wB8Gz+)OS_08`*4huajjEIjO5sE~ zTnp>EE?1MfwLce5ky4vu`JlvE*(m>FDg%61yY|Avrp4x>(}Y}s@q3yYRH=xwXjFej zm%k0XRSL(u4mbK!4gFNPf#qsFb+0 zerwapWGb#kDecD9arKt4^^=g0clcnERuv4ZW;JrOh>3A(lT!wj$~To-{|ZYaHAJ^g?RMR6n;VJ0Gsf)XUD{YB@?vZItmmM9bA<>!x*b3^^%@f@ z*k`9P>cRn{cSTLguxsS0XW)j0Z|)-ET`HJPZyMaoe7!~eVW@pnoy9I)e8B?yHof~M z0`-z+-9>|$pw8lP$X9x|Cek%-5O(jlDw!cumr#48^LCg%IEbzlm!Ms4BaC)e`+;3` z3V5}riVpYcEx8J7y916 z5XZdPn6dBCRF(2~bF@=vjbn%!3;zYc}-FQ?69szx-)_T1fM|@ptF(SBP|iLkNE^S?2k}R+#_%-?FN? z2U~hsDMQgt?U|@O#~P4kde2wvfr8|GeJso|iG-E1qk#9=s44Er6z13wg{rC-_KQ`-fbk|qPJFn zrF*PkQ|VV*x}WKb#^SkJkP87!7)5_aSrl(kOT~4MxzDSAp?4~p;}sDA`W6b{Q0wn* zB^=fs{}SQuD7X0wYI-#SNDU#xEFd;|Y6@BuxOACd?&lRu2QZi9+e(GX#33kyzzxX0xc)9+b<^ks=-1^0NrH&V~Cb9g`Mg_o_DR+3zIxfeAlvIUpd&=$F7g21kmul#mdN zp&_AZ^GAg>#g%E*dkjs^ocH6GLh7FrHIe~qPb};0x9o2t!c zwZ%W6z7#<>5Dv<|&WBWe76(YzFC-daj^lFmAs7;J9NgRo9p(jqzfC3bsdb>ZsBeK( zD%GPm$^lDSBjzoQx32U)`+zQMeoV!?ZVPe3cNI%-57^j)@~R=2#w1(d+F`b8&sJE| z(#pg^IiJ=Z{=|z@&~FxtPHTj{vl{|Dtp2s<1Y+K_6o8HI;4MO>fsPRDY$!o+q;RVm z>kPinDq1>Yex7_$RaY$h?Oo3_G`Rb?s{Cx(sMpwPlDVE8Z3`Wd8~v!!sP2$oe6-nAK4dZGd3_d$y5gs6siUp6_!zlW&vYVG0p9?8!#oE$_#h zEW4OQ{?uuc#nR&OwiyV}9f%Yb;OdN1B019A{1t9<=T2M@3Z|S6JNbI;aU5*8azP7z z^H*z;LixD32Yb}-N-7b%CT?q_?$rT>^ACxf%Be8`Nv+C&`tXc6cOGcOzUUFBP3$wm`Re~LqRBQ-}F74n&;b` zvx=c0XTu040EZ%0wD?M-Ei=6&M*Cd$_hZu)P!<++X-Ev^aydTtb3;&z#@@eSnF-1S zhXAI{GBIb=Sx%K@au{ek8_8OnVMSjym$BS&52LMBtjdm)tgerU!yiqq?E|cP(sLqgpqj_X%$IW$o{v zch@h7SAPf0{H=xP%_sIrc$rT|j5WtwVnV=q29e%I?X7yzO#3IcMlR4ndU~0lZIU5C zr>8gjB<{LDrOe@Pi?~Lb0_JRf6bnvA_vc(gH9!4!%biJ0;S4S1Jv3d|RN$Y7(py@S}Mrih?#8LQg52fTgRu5c>Tu6=HUUy5)$jz~x3`p|GYT9^EL8G-2YSE&Sv zWPiV0MJq56>Vg2wLZ~m@v?UF=$60auKmW&KvOO3wG#6z|zKHB@M5&iXSW5Sk(!!IU-6RT%u^C&v)y;4Po`ZzuViZNL${_dQocFOy(_CNfIQT4`+j z8TOU|de2tOGB}z~9O1vnc&nwYPAhMjH_*Suj0OYF+;*BPg8fef{fSAYjZl5XrePJS z`sr|`VDn5PQI`q>em?T2sqD zeu-wh_EnLY1P)*j*uDzIV~%SsN`E!SO|?JR9F});%pgXh!MC7Z<+dW_ zYM8Qvl+D3F@BKq(zYP!N1Pjz9K+_u+WVL_4l2_kIG;3Zqa6K-;1Y9DMolD0TIl5iC%nY$0u7fS)xnJk&YPljBucse%Z-`Xhfi@{7WquZ{ULnb+ z{=e5DTXFltxs}GBCA#08W0=FB9Bj!#mB9C20NeZf*F`zequoOUPa4pL)92HQ79_2m4X*7MTikj}r+((a@8-ntFNPnWqf%K!+m0~+!O`vPx7FZfF|vJ$rb7%@H7>VLxt2B&#yL5wz#vPo7TK%Q zSh{ddmnI1|kMROcu80y1I;3NCX267kYj7*u<>DlQHlfR(!v{a3tICb4-q#v#uL)Nw z`=9>;Cy?eFK3rJWv0f5n_6g3ri@Ch;hxW=quFQjT=ZKh@aB05JR|tI?jcY{4Us5? zIFyO_*aDT=Ub*4q8Lq^Y-4?*5J4nJ%;4pN-JtYi!WGZom!}C`33#d&B7BnI?tpTu# zNd&0r`*%@u-eb@;6de>7HCojBH)!{*ORe?*8~-h^D7_+-s}sG_Eszee`FtC6E$b>< z6-0JznL&^fav8{TWb;RLlV@n&F2f*uYOT<98w{sX`zvas0|4Ce0hkQ)WKj7h&ING0 z65LwJjYM;}9y_g&tapI#Yqq<;ZB z0G+qwsT#Ci9NKUSoWFOAoK6644ztD8k~BmoE>|tzN8aN6Hb8>oWx2o4wPfNx_|N~j z9G9srZE8CSmnkBfHiTd3s^Qda_fnT`qc6o((2g0o=)StYD+1sfebvA;@4QOz-h6 znf;ZBOd|jn=n2k-PW8)rd%G_L@YXig z1xCEo7LA~8MF0$03&{}&S}9xQ1kaybl`I9stPFmL!ctJn1lq0%Q12&ebbNMt1x^w2 zoGqtlEcZm%)IOuzTaiK2#Q-oJf$mY+NbA-;o3p(@tiH8#q_0vY`cey;``P&XOZ;7y z*38-ESupkD5Sicpvp{6emD}WsMlGo5??KYT)c(k?YeN3>q=SDX*uC6?$r<5C|7$wB z)qkDdWliA<#cxI0pTMp~aY=+VO0cb6qHn{1K$~#w=La%E^oISTs!PIWaQ3R9OU0EX zA{kl{TU(B{oVNSzx=M)qF%*VDz?2nCTf9Pg5eZ?ILH_>0il&%sY0WeBtrWzcbBTmZ zg!KIl$s(6^`Fv~XUdjS7k_E;Z0fFY&f~Ka!_p_0i2A0hJw?h^?I=JrDdaG({H?9ws zdzZ{YE9MwOU*^~}dk_Tgh(%*vQ;;r$Oi;so=pt=mG&B(5*NbB@#f4p2VF{A_4zwuA zXE&#i?vyv>zT~ggUza&P>;2sB<}ND5zeRf2->iX0w1LJNSZ@dPTL@rtvj1E=ZN>$;E)lDNH_UamW$o2>z9%M-L-DvWBC5G0J_GMxP~cd&5%}|*Z|Fy zI0P>$tU|}T;j}t;msl;Lp|s*(p<>enA%p3l#d#wpyp~RUPr#SizK~w1SWivB{}<=i zm}^iG+FPseH}U~4)-S}KB|pA_lrw-3ga$v@Ta^nSM;4| zAN+`9QiX?ovDViZ?aEJhs>NSfED2K8GuL%@%B(Bt& zszMTq_X0_{i@hv+AGNE(qLY+rDN7*<=M^>dLWsKg1m@!N&y#+qU z!ew5#*BUQp$Tc%PdkCmIg=a`IKV52k{_}r{d*QND!^o_un!d-`BIYnZ~tyXqe{}q59I#L@9$2e7+UIOxL5T3@I9*)W45JNjKkhcpFz04Rk60 zG=(HpYkAL-(oY8a_fbQZ3r-IRS+{^a3z%e<9fv@KfSf~&pWPzv$G6D3fRJr%@7BvI z?sia@LL&Fb1Q7OO&QfvxV!k!P_I6_RZCkix^D>o_4Jr#J?l3NayJ;;gQzK!{`&mE$ zc1v0S42{se{qLZlD@G16DFXX^S&l5c2O#dzXrQ{v^C!;9mWFa^etso5m6~&vt)vn% zmEktYa51dpT%oU!^z`6A|I0NJ5+Rk1es&t@5UyLo{kw8uYU_5d<^N&!Q&S<(S8%6{ zkr~vbe_g})2d~4FvH$!3-GFE2nZlmsxvh?(t?p0kV;sV;ro+N$G5Eg;_6NRu)v~1H%G_=U+ zfZI;3T-<^!>awP_j?WxKo4#bQo&25(xYWjfyRahU*qXw!LZ1!7Wyjc4V+K;PJrs;~ zET4gj>ly}7%J$=JOFN_jMb;=3EV?E+Zv#^}+ zJLa77l}H1m0ffKZGTjsHb024*GHyp4mgd|*+P)PKfD5cy0AhlG49HvHpi=j_Ok;{u zFe=uxtteak1*G*hm_;v%y5gpogj<^J8j)K=!a`U>gc^a&o?*^=MoPV9j!sth%x}MH zt*RBihn!lfZt%0I%jx?+m7PKAel|I;$^d>ox3IFNz&4f;R>R&9KFOdCreISW)uo~= zAZr<*{wt>NI}&tVLN2bb1py*!x88=dqzN1|5WiV{4%bM-I-8lDh(Jj@SG|Fo$L@gy2q7&2Pv#*e1eM-tNikFRb zHv`@5rpnMbsUdSJ)J3n6^#wd_Gev6<1GNDdTw}QcdrqALA293%z#_sixxU5ZUUVtX zCB9FFdWq&a<0W`a40XN$S`}a!m-1nW+UqDlz$nrc$^ZcX07*naRBFUe^r6i5c|d94 z!(O$=#H~gpn0yN1${#f_TcPoin;TxmWelsP*ggpV3fr_R${_`7j`$b>Doql!**#np z(+S;Jh_nm#{=`nH(se4^?%RLx33vGzNCShHr-Hngn3)0Av&DJ>xbTDRSL$ICsEC=D z_m)QclfqPRe8f>;T#Y?tu!7x*Jg)6h@Jc%vyjwRziuViZ)Z@vv-=$- z6apO?p4+%%Fa3!?ilmVd1Q1qB*$&aGTH9f~7APO_FUO+sf)7yut*rfh z_EJSo+>K-zHCO{`?HsDC-x0Xj;DZXPgbZUW^X!WVe!V2O9LClUSM1{+LIc;a*^{N1 zHq+yCj_dQyZh!2#6s@;{y(2rt-D74(5w_A{xUIkny2o6V8pBrq-kbtJgwqQH?-K*= zEyHK)3g@|y8pyf@WAi)^>HO+kg5s^OGv!2^Q+GtONAz~dCA-f9JA{FgX9_YpDYmIC z%TEnD7G0tqKby{7=vi)A0!%%X6brhwCk-45mc~sk(YYeD2(npOM_27bL6mWD!zcj$ z3E3Q>^FwSNszh9-_-j|bw-?63sAAe)PNZQbNb~mO7N*oqvlh5crSRVkMPnOnw=c=c zLBtgF`2|EW&j1E`Wl!3g=(o}1Bo-}bxi3u<2O0A`YYpEmrpra*nuk$sF~*wbCZ9Nc z+@kO`$xWi&Pd$mobpy zh-Vd0G#+RcTm!-HbYiXLc;inIrs^PKd&#K@9v)();|f8!l^z@w*5+=hs7`@$fw1Zo z%~ZL8ol4dbH`r4cA}Y7+%4D;PPVZ(`Fhr1@jpaCpetl>OD8Gx%0*JR}s@EU3*Y#$s zRqR3n8d2XB1C*(w0&k^oVm-tGk|lp?KK-|ypyFa&^+A8NLG@8CxPL_?w>wW|doxMb z!B$Li0RA0S?I>5O@Zv?8>MVfJe-&f=4UjTV;sv zWs*?#S6WTJC&#UfxEqI9bS_FdNvgWw?@}mi*{rDxBVnlhe=05?TOh?pe4mK3|qar&h2=`1Q`D-Ud_KqaYN#jbSg&TdC**{T48)pZd+( zRVDiAkL)c6=SznL5D98(crcYEiA|YETFd15LY6H+`?nZ$gdrWIowI~i2IxSHd+C!n z9p1LEwG?5kA6qbiS{$Ax)xQ(qV2!w9XSNS~)%y*5tf|cz-X}FD?^`Z_p2&eTus6m1

JSn2agwm zUXj^D@3VA{tpM6GC(p<|Vie{c7Z*QG3&03OVNDQR<-CF)H~_aTn^YJYWmmQq1uVe@ znEIFhDNH1Ki%68XSNh!;u}Y6mE_%@;gtwG7x3W;VCUXC@$~}|kTj6{LSOGwKb)T!Y zNqGrwS#!c)7b#b@rD<<^!slCy=gJ|3u;8%o{=GFlq?uj?a-_EyVan_YekZs8M><(W zQz@goy(}*~Bcz#^LM>Nhw6*vAm!3p(c_My(&(F2*4EJr!zF!9LbuN5ao58ahqZJBr z*&(2D39<^}*nm(`7=0HdKU6T8MwDdcDC|4>*82PF8!8Cuye>*2luQh}B)ynfI81d~ z6G#B6GBFiw<70x1Cjy2As+uDE-QX-PK=A0VDlektR6)g3fmhY`W?w)nxCabw{!*9n zigfpgPZ`=Qaj`XNUMj!Tp3x?V6Z*+TN2&W+I&p+fSTN*%`Adg;j@Cn^q&h~0#+Y*^ zo1betZLL_*q5d-9=*4+~GS=o@VUzpl_a`*Y?r%pJc@98cx;iKmZBeqV8I&Kygw)MBRxdxgc(Hi47^$GHs?i%2Qn;g`R+qA1lh=M>>gf%h?`C< z+x0AIw+Hm%6lPaY3^6#wB8dQl$2C;fZeS1w$sn*7L3mLq?Br~@P8FI2ZhA-L5B{d; zmWY^F-rvd*t==wmK>xLqMHbv2&jw8|z%$U>#vs9->Sno5J*ntn3$n?S(wLP zc*U?e1#-JusNZOm^$DIAw97f&h#l9_eaguc!wYuJD|5#ynXgJL^U~v*5F(9-tUD~B z7J;*sGF$)oU*#BzDgc50 zfa)n@>C)Eiazg%KDFJf(rsgbNx~)lYhIJ(%s8_`O+Zxt#cn07!xqFzL3)bj1gU}SR zlD7g{wQwFT_`_Qk4PlBIrBhzks8%;;<$tqT%*DaFUGXm;Ll?tDwW$HI2d)k9%L?p? z1!fh?)vxxvN^!x0K`uzM;=Y0|D6Sg_BCRX=?NDZ_uSR7iu*jciPs5e;GL#kPd25M^ z%aN0UoTR4G3*KW8L@jm7 z$@c?}u$8SEk#y}mfqyO#(7UL`G2foQ;TE!@pL{L=HK;Ve+ttV>%ESjErQfOe1*gLi6~l?UkYG{e~bkSxK-05^cv%*k|jpp zy%8r9NVWw{_Tx$;YeG4N3Yk2yNPhLO*8Kb}D_ABUBv`(8CvSKvOmNrVhX9z;lP9;f zc(J{Fs!tKAyIM7h6r{)6vx+gz#c=qXPI7=Z%R+HU3Lc zV9Uj+^hM1cCm;ox9u%vHiUVEG-b)4Pil{uCyWH^pruyD4)3>r;HHpDUf)J9J%S7SIW5SN{AWW z0-+i@=RUVr2rVQ(0-&3|-@oRtrYF6MZd=Bg9)KYLWs!sQ-5U07L-p32DY~}yrvxR> zPuy9cm@U1BxP>I_NtuwTOL5B44+;Wq|L}di6)oh_1sH`UqV1e-8(hjLhB3}dL~bjd zGtbN+QlmZJ6DFi4AcOioffluLD%T`Yw#SW)LO>S5AOW{va3mS#*_cGO zo|l4G+qHod&Uu-$Tw%|mw&?Oyz8@GNdW(50B;HD|wCt%V*@`R4-MEj%JuB`HWg^Fh^`HNhvK$42Plhrbwuz)- z?dR1>(`J8r7-@TIqir(*+gs3$gqnqjd~WT?62n^tP)22nbphuyplqCV!!*g|N;_T7 zxj7cIdln+*bAQ4$kXjT1LYG=ym2ywj)nzTWl6?%zXM^GtHfPAUSV`y0Vv!AP63VR6r+pL?C%P!u+D(tTcq!$c(wCGWS5b*OJTZBPzG@q+1`xe3& zJbXJ=Y_Y!I%P?nhF6rN{?EASe+j4!S*)#%T);CEdlP8 zsl1;7(aQiUhBe&Ec{S{$ZlM6{ve)WiK(N1J*{)6Ndl+OIv0O{eGo%H8NbRtE?3%Nk zDd=-*6bNgf-Y3h5QII)IK)mYPva(bFVyXNZBiI*Z7`2MajqP7>Htmtb+sizcR=v_K z^So^;Q`6g-6HL`TaTbpp?6wvu{RlN=v=jIlYM#uo|MP!wZNEjj3{!ZIT|obRzE6lC zQRL_zIGu!hYJ${nxBGnlh6w7`{o9mXD{QBOC2qS`h~%={n;dY$iql-g=k`=ouP_B{ zLoi2#pxxgyAL5L0(i1Mrb3$xbnDZXO0EAq%fk2T##F9CWk#MVD;S3X3WSB^SqhD5K zBr~v>6yN9jH7-sTJ@rp$$UNNDx$HLrTn7E4w%NyE6k$`|=Xb!>>l034N&%zWA#N#^ z%U=n_`$U^c`6u@7^A+;knhuS9UQ~}iJ2>}PX1$gCRJLDKyP&CrLBymlK!(Z)a=kbK zYr~bdxs3GweLK1ba7`Id6gt_~+di={JR1vVje8gn?)`D~h|@icVWQ?};sQA6A3w*x zx=~}X{=U1!R0ZVXXCBLE1e#iGaiw+D<*%?z9-|UhR_TQRz+@u(EJ%kzimT1hj%sRv zV(Q{}o+*P`;o^sydx1T1N27Vhdn!oa65dX^uymdwEX#5gCS`u#uNoH>4r8ub_qUp9 zL@^k_C~U;^`%D>alMMAI%V!I;-BuE&kjN2TFv?p@rp}D3lb=Zd)^Qyd>uRLWI~f0Q6s0McE9 z+LO!b4QN+=M4YLEjr^{@DHuOK_&bJkCN}YlW8>`C7-s5YOp>k%7l<+JPR;E)>qfa( z{2!GYIa9Z=$5{ET30Cr0vwht5$S?QOW1n13s-3ZsahRC11oCXIsNuIY#JG1z1fHf!7<@zs4JzuZSP6pr?!{e-O|)2UXnD zCiicHU5@(6Y#gEZcraM`tV$SdOm3#SFLsGvmGH&RYH8#evw+ht zFI)R^QRR=d+FeokznMRu`knuSkN{CYuD|7|Ky2S&lLSaRIHQZ7 zt65ofzLxWO4Q-J_dPiO}s@+>u@X3X0VO+|Gdx(5r0k{=!;cZ-}I7sk*e4Lx~c}gX# zez}t_Vu;2OP;=4o#d*{9}RCrrNc}Yx(@d^D4^8{on0(bu1^A;y#Y@pfZ>W82{7DKu`3rB zIBUStqLVL7K))mCS^e`O!Z0>bx^e0WTDfYpF&I%Pv15?wQt)+t?ujxrH?qYR64NUA z*2bx3M+s&oiu3Xds`u-$oeS3P9$p`5(Q+B74d(!U>y(b{YP6!-J+1(z_JuW{JQS)v zv}2|*XEnSJrfy33t1rAyO0*A%Z~B1=BDl=uiO`|-i=D#VTDhnbsryYWdt8g;YVF+^ z=F6}cZK8wA!kX4mjl!iv^`b~%tMWbt<8I&;a2kTnhOgomVO%E-PJ*b8ZL?I`pQn-4 z5|NF>AI7N@X5%Mzq0BN6VzX#BFPAIh5Gf1rjaYjT3j!hIF$MZS+3J`ei=Q-J6hhmt zdDmpTz0>5!{dT8008|JSB#7p}x4+L#Dvf>=?M-JISyb5uy!>p_s680$4+mk>AM;a> zbhRmZPd7i3z4@b<5TrTzI{V-z)Ek#dPL(fBN$8A6lya^wEqM7S7VAsy4#8)^WU*ID z#6=L1%tN$0_gzuXCC0%;2*DE+H*%pv`g{Q@=+;#wxnA!3(75Y)##7>`7luKAgFhlW zfx&Gv0$BQ&3u8Pv#-p#9_)`UJ_FdQBW=PoJt{u!uO0<1d zE38=pnvKJlv@sPG+J!tEf~Dsdn7yhsX-Dg_5Imbg=i}<%o_xPZY5u)sfibZ_U>=!8 zoLdo56*hS!zr`{ON*>WlsU7=)TH@9F7jf5rMxdVJRRo9HVd5U|-ipQLPK6HY_1TBB zy3nc(4cOa1^+mFrl&i#mxg7-Ptx)rd%hxZlyM8ni*Cz_{{LmJ-pH0aOcx6`Va%ur7 zTlWg1Zjz*|M;`ZHJ^QBv^r03Y&Lz|Yui(y3p-@p}elb^RvdtvEK49lhtKnuMT>z>6 zSIiT6Wx;vy=tb+4Pa(mkX>J-wM+|E-lI3S^%0X2j*gVQtko*CSp!CAt1Xud;xg|0oT)W~sXn4>4wvTCdm;Pa7NtUTOJ< zEaNLAx)o0A$@^Daz}HA=MvfJP&sN@teA5S(N~?S;W6Os|aF3D$qA!dR0Dk!e;3Cv5 zdaz=|es&_D@Y9!a!oGD`=aF^gKlP-M)WSuw?BB2m`u(o;o2z@`Ka(5P9>= zkgNDrLI+z)1?2f*@$B#bRTKegFP5JJCa)qtD4+>_n#QI|W|#~8=&^sJ*HVoaQo^Nr;6cs&=$`i@YX_;g_QuCnDo?0l-sm$Sf$Y{1-e7g%|7kE+w-b zESaE}nn*kBb`Pbc`ioo_&;d%;Fa zC0=ZmO4}klnZfT&0EK`4D-i?4(h{Pv`i(hVvj?^6Z@Dmd-m+V3oP`YC6Gj@qqNB)k zrCw@_xV^uViGA}~5YxIVMIra|*TiqEhY(rZT8&8NckVZBAny3sd9pQxlK4!;pDO^a zHN+A2Sy|@=ZK7Z+()@-LqyqP*<}XfUq?Y7+ocxVmv6G+6351L*K86w0Dse-D3XyFW zU@jQZ75sSXx#B1+bJ>4S(#d7r*0e*$Li#5_@$9fiHTC`6lefjUf#}vev~F7w*R2ge zCaap~LVJD!<2?I4wIS5w!j_3}6cYRR{WuFex*-DeNUH=SH^1D{Px!pO?c=%@e$pKs zpRGW^eSRu(B~GPdD+*A7m0HW#1_%!;&u(ROEG)QnWho(rO<~jvGJ3@@X}E_=g}RL) zLh;t`ybqy^3WKttEP>~bXSJp2g|>{C0;46k;MqOn-rAzcE7rGVC8kU>a;0z!^CaLB z(VPTwsBEDw^SiA>P>50RlT|~FLcr%LwU~}Z$M+bZW!HTM{qrF3{=g~{;NVZ^EjOkZ zZ2AXupmC*g*3^PqAz%i*NUt{j;m>~0h|wyXf(sX|jUwD$2&7|?#x}RGLXC_vYE3ts z(k;z@_NPlH<%o884giXiuA$J`gN{`>(8Di|Tp5TTFu-?32tRiQXXU>}acR}rE^>;a z{VKldCq-a+lfiD8km6wJL`A;P1z#zG>vL5@*4Gu42p27vzTghBY?O!+5oF1%M5Nl4 z=?13f^V8-Ubzff^j2|(*rM_H3yJ1rNmHSoPyGtFRx9j*9Gg-1xxTyFr5x8QZif{WX zQr7{$h2@>?V%2ELiqEPdLsq^yMXW2&WPLXK5Mn;4HA5lMRJ0rT;(c-2|g#|QlHUsWmU2dh12i?jf zVl9`L)g;M|o8#K{lFOq{IxHdNlC-Y)W9D_Mwc0}{7CWhFM ziAwNkQ}GqbZbf=d{*6*?S2ny9C=5t*%i1>!5+*|+E~6EBFrcEyR{h+B2F4D*#A>qH z#1`Zbv{A?oH=F64c&jIMD{`6DACN^_Yln(w5rXqAURB3{LpC!3Yc~?DH$WhZ%K8Lt zgfA2RLFs>a=wc9UB?|QXj{P|gwm^IhDiF#9TyP+lowvXaqp|qKKmYgp3<83-i<$zl zigFLa`}f^?)*X(}FOU=3ehe7t!EuP@o1QidMm{#6{aoE5H|4X^S2fJ9f=)3=a;$sS z9ROCy>1D-u;PEl_qUX%P^i)>f6B47&^rFH+ueSC~1)Q34zbAm{0s?4Bjvy;m_P{bV z(gf!o|BSA3f1g{ZsYO<2(*el06D*r9e3`$uXlN@YojhOB?K9CrSeD)j>1Ql$&Ok6~ z$W>j|M(($;RT6G3QM@|tx2B+x zxj&{BQ_EV)5n+o9*1C`sL!&og_)|&xTR+?o$kL8-LU5ri3|IzXrmmdN0y33cwf-7_ z#rga6;$W0Bu59fw{F!=dRNrZd<*;xD8t&oA}>3f)=K<)3BzK4tBRT!6Im*twY&qgr= zKU3cB4RMaA!@d~wNA5U`y_Y&`;PZPCqZ)?uC!th6#M|4z<0=-b_Qo(i%_fVMU?Kx9*seN{7 z5Ecrc({?SFz2Hh$Q4|rIX2O?j8##mLhT$^_V;pm2vT>sUYXuX3j3Xw*_4{bbK9h5mG`RT5pp{xE{{2b<#B{ig9=YIW%QVSz0hD`Yf#aW8XB5_WL(-&*!ayLTx060q{#rr?#?Thzhi( zaR!wAJ1{jJy*1^gX_te?si|1E`%)7Wh?vqs=HSVa0JM-TqK4@=#eD`L%urmywPkk= zrrp30d(y9nX)Tp4V%v70^m`VSRnoT$8$=-uo`E4yw+uPGNe_FW@5hP(y6)L$T}#?> zX;IPl!IY^j?`QXUYd>E;naXIM)qcLc)Z0Ixn1BHYv*+jSpe!Kd6!*zao>xDN#q%0Vw^y~51*J?tmo$WRpI4PI zLKvm2%etS@gcVZ;_lNh+_gON-8gXl$kpXIHw!?Uj14q7=^QDx;Uz6>K@ z)(IoV3s6NcC(ndnF(L%OebVKqqd$g&7^xbq#Es#ybO?$)_D)EV&`Rh+sooW5aknln z@L_`v3w1v4#d1|VG`OWIMP}tKCt&R~7D%)UR4|o&`@uF0(&V!k3q@?7-133e7IC6& z?*b9-5CfPWS#0_iqm!|wi zFU~9TJ@|WJ%7@Rdye?(?~ zE1^Hak<6NLY!~q;P2ok9RQte#@&n+zSww9~jo-DlWful@bO_~->$s3rco2uK&0sio z@$M);i*1cXY4#VJjg_TgPgBn?5f{MsQY3h&sc=_8F@de-u?G^Trxh0z65ib12VdG# zhrfT%O)m+)y18Z!gp!{WurtYIF(|kyBP@gpg{7b}hAd1ac;OQw8Fj%aTTnJSQwh7q zAnq6IA~j;jt0a(gCdc_tk(RTBIf)?{>Ai%}tX=!9?Cwwy2-XAV;aD51sAS)}!?W!N z-3+mug)_Q&G2X7QAIaWHVw5hQGnklP3$BQtF*pwJB67r;a1%CEdt&rUdH7>7X;Tm|M*l zq`%M6eqU}usj^=$)_h)VkkB*BfMXzkacDC+91Fdv&7?oTfm?q}rd1KhlXc!pb~syP z=jy8z=HX9mx)(kc*5RbyA8gya3J}-Jd`E>Yfs837lsaWLl%rxkNssMv`(|fMez$;5 z*JA!q1B9zh@8IShNjLH*L^$YIiX*yL+;qY5mp*wH7H;m2U`5>x^W#UIn5yPg<+p?? z8)5gc^XY|_st4<(ew&S#!lgMjghgs1B|Xl^AXq?#w0O6NSTDMYor@3VM;_CIKZEYw z-48#m_L=1}Heu+I=>u(WYgxEcxDhpC*+0Ghw-KS^THhaV+GvD_*Xi>6$D;j1q))RQ zjW-v3d)df>3oiCQ#seVQ5`o7t9jQymTv(2tEp;9iq_n@6t4U{+&wUB!G^_L0=J)6{ z752~ySZWqkC7u*ewM`e(s{v3m+f_!t49XjjElo5${$iCzVK-rX(l?1VU=-EruHpqc z62B}Wa0z-aNior|!FpqJMoWAChW^=Bt1KF!}ppEE=X}Gs@dZvYZF@Gug zmv@`io$~+VB+Lh{AEWcl#fwyQ7`qYc1Pf2X`x0-noBb}^M-ggE8hD8*xm0_!wZS0%Dp0$V#@&|Yv8-5bbT93C0q#cQ5 zjHCd9@vvQg@x!Fo#ABZ3sVp1x?kd2>Ln^MVJeeYQ#;069h=5g@`rJ8xa@T5Ks*^n+ zARnQ`#@Vk)bGONp#vEQilE1YGyeOgIVS~N;@EpN_c5nWKTi!Go)}Ur`mT10}5WxEY zJAIP+3I2>~%?Pq|-Wo9tok(Z#xAoTW2HCo>I{{D+B@hhm%hxxXk|tY9ogf9^P{z0H zpom_Vt%ImPt3eLgMZ4-a<>d|6O%wNlgSzES9N|ssCE?S$f(q=q+>_WGBgtx~1eEHS zv|FqaK>qW8ZoEh47H$(O1__KXdlOOP*=28WdQf2-h&0ULq)f; zJXeMAH`l!`)DvA9f?}B+=2d5b)ILKdB#=ZkNX4v+N`Ftp5#ZXYDKD8lTi`k@8Zn9h zoX)a>z+o9x19Ah4+;fRk-MTqpScZRamoKEO;zE5-OaLfo0L^o210lM9zOCE3D5;cn#!o&(jHskvXZ*Wxf;uqL&|&{3ut8G*{*fBsut)hIp& zhE9?!ZY;8z+I$*$bGUxV|d)T-Cjw4jU z`((PlDo1eNM5G(rSFH8(Ma4kb=PG}H<@9YM1#=|`Tv-N(TFP4l0v8R+%sy*;#%3*} zLgr`h(lt2anls5+bmWDC3KN+~txwD$%B4+Bekuj#*h&> z+~NqE&k%zXS4O0N&8EF+^A}~}>i!u^(lW8fNG$_#_1k@{4N(KQ>N4yMa~6=vWqjTe zmNb9+DjmBJr@;G=!|#XFE6hN4?YjrDQ2PEX&lC`tAE6y0 z`GcQ#psv&z{tw3m%Ia<+E&CgKB8j1#6h+}YTc z-)uqFEuEa&fDC|qR|_S!8@b;AN|K|J3aG?|+CmbpZo2{*MB67xqG;H)r`*m0@5SU?ZcmL0qQ)HBL>(dte zqc(OnAQr~5)&^DJjuz<^*#RLzZ?rHYP64A7ebKUms&XJCi(9kr9Zb57=t0`EuUM6$ zRp;8*3h%6=6@%CgMzuBEHS^}t57;I|^8z5q&xv>-C6(P(DnHwLFbmCQt{(@3X64d7 z;J(%@iJlF3*L#^F>}UZfIG#)9iK&1~m{)qRh7rIEyf;j(a7Ww$Yj5b&t95#jyWMo* z7aH<85|c8sOz%d(2t*oYT2;vo7Q01JB(9WrI%BJ%h2|qhPn+?sO8_^ z@L%_iCaAGOvm82wocny5freWmhpZRw$;u3?VMDHF;{S2a#EK39YZK!{Vrs!eEt0c7eG z5k@^hXb}6ju#pBKfG7+KmndT09M)8%$+8H5=AhvIw}>$1UJg*4XOom&PsZ9#xK^d6 zhald+WLZD>GA@2Ud-9TUI=I!IV*=A!WQ!|Vro#&A)yezh@|l0_yIQ#4&(?h=5RlaQ z^wgzX(EFFSjhVcpcc+&QY)t+&yzKX zVw5DcXvb$wV0){ODJ~(J5$^LjAf`~+Gy>PHP`BKEKZ&*$rA;rQ-N(K3ufORYr?&2A zAXpeh(^Cm+84zJ(T^OWed@khP+$XDobt~ziyS`8Qt7#f^C0cp9Z<%GQ#sWeuA=g{R zo{E8Ril*GJYAO9SX}Xq`Fj4-$|8skb2|+7U$0KnAMIJS#knN=wwL+~NnE~l-a)FS* zO6hSqj~)JO!8zVq5zIgTeYuTg!J&zNAI?Zd^nH4)o}53`l{6`{-uzu{gQnZ~0Viq+ zIL`{;x;gUE%a8=DTZj-=R@i!5-@~bYxAXG6%?Uf3nr9RyfP`B^8MMkxZf){pv}d)l$RIU|7`bkx z)~d*rTlv*HC;)!>m!Leg4D);`9M4ZgsG#@}$T?wsJG6pZlmQf$aGv+vgJrJ_Lq@-v zCpEIT`iTS2GQUh*8H9i8&dxJdxo$7SA>9Fm1^9Ma*w-xd=_U-|I{%P z#Y?)DXpAn)0vOK+BTWYZK-4W5h~Ov7d*<9?I24`2YIQC`B9P}W!nQ+K`VY?%-%ejP z&D$Y$Su0oQx**DN^PYRGf7t`MKV%)WnQZl@R$3U1ynjRDty-W02A~BR0JHt;VOx&X z34w*Cky>J(EHQnXr3nyWWJwv-PH%xL4O4U!L^KdTd`V>sk#ZB&s2rbIA>k;!3FNbJ zEgmoW{zzwHHn8XZwoCmzeM1u2HVwDI{apQ^{B*ec`9k$}2;C;IAb^y)3O=tg^$QfC zUVXmh`cI4&;k{8>Tn%<-f3NJL2z2o8y&XBtV*0a|f=%osHxtq2uFtt)NMzkl`vny% z+0e-beO)(|=Cs~Iqz6u)Z;iwA{dIjJTvT~)qk1Vz#G{&+V=nPlB>#nv3V5kDM3FKAjF_ zKz2q?3a4s5V8vCHxtj2uri};Y|3{n947=lHG6i;v;Dzw#1xPpELM=_(aQ-Nw-Yj;G zAjR0AtZTN@KX(0G5s0lREtcA7 zZkk(+8wZXLsk3JITKi1z(J1F&OOSsfv|)=RHP{U{>AE$*mg}uz|DvIrO(|ki{hu;l!0A` zl_qxsq-qKf*M*6A06%g`*z&psn&|S&01Fj(r+k!BpuZbMuc38+cLCKk+Kulrx2u*K z;q4k2r>5(tEYeY>u|A)5DWtsgs(ggL0O9XTmt)_NLivp zuJ3G>dDu;6f{fGL@a~(CbD6+N(Sk{dRhL_Bky_r8!i1vc`)?8nmzIk}LpNB)kgD+q zyZi?!-wJ(_&!Q7#cY$mAWQf)ZL0zDcL<>J$SNOGN2l+$RlxsG)Snr?%hHdeAxozzD zY_5zrd622!tfmYr;aB|30XhI*j`gG}Jj@QbKR;ed*ws7b@3Xcz1Zg< zr|ez=n-lpKRyT$6<)S*N9hOKkc>c_94tC-1<^udn!orpx=2uw}TS~Pr}gDWL+2LJq*MvSE;fN6ldcwPeM zZ3MMjT8If2$3`(xwoO2bP+64z9?;8Y_E;lo07}@3 z8WDVPIC5{lEc-DQq^9OfKJVfF$P~C`Df(;BK!pRzG?@ZJ z{cV8c{S1&PSI}>Vq;l(fgqM15Y9InekD2rK`!_g&g?0VjzZN{%0NSP_>q5{7=RMFX z&F`%A$|~&d9FvbNxH7FJ$}srL?Cy`y^vK&WJ6H`m5cEB7Tfp|7>9>)fmOw1mRBa`{ z+kXfXKNbuyJ<9GA2tSId4Z%x@z-k_Bs9}MOf!pYpb=#~M+;au^aF->%$0K3e;B<5) zV5o##)d0W54z?~WXv7+(x3|8{%lc6u(%dr;Lb!wJt3=ZGldB)yeBJ`LcdJ$bYO$T$ z{P(J(Ip|Ab>gV64raUd>2IoEqHswZEtAK?OM&ht;`}soJ!k!!LnTInxNjaNd&MB{I z%Asp(Dx@;)q@dATHRpf%-!FeFVO$J;R6=%&Z-YVIW9><}<$wMwmSRRsy*fj#LCxdo zRT10#1IvBc)}Ti2^}#}1KyUvzw`)1QC$9XAiFIQsli@8v!_a4BxcsXr&5=+SHISQG zDAD{~7ddGp6hEcy%4*{VqxX5${(t`;e{kbU+Q1b~m8aV2^-o0zx4G_Y1zTD%OLD

aykYeVpI;fDkGZvC(y?;Sh#RtF)J4I4n~CCRG~cOPw49##Gk(IcpCpzR zs)?Va({N?b-b&pHfTGPxJkJ!^7cOCk1-AqLXHlR|mr(m~m_hTPA)2RZ#(XteC&zsU@g*p}wO(L+AZD!~|}{`Dt%W!FAB#!R)9Z8pH(X-X7A}U>D4M z>i965s5yg7?J3Xyme~@v>`S7EkT>Pm^Cu7zBGj(RL6%^O{lQR)$wYaE==^WfjkqsN3>*}<) zN%j`%9v~D>8tZ&n84?jnp;J~aLvB@~dsD*uR~JehCW{ny?oT7#SzZd^MSw*nwGu0P zO%>skfzaF1uZ9)cjA%i>*~|uGsL!!(yb;AMzH%=Wa*9~VZNSH^O@TRZNj~8$vv$>v z?B|jl8_AE}0;%{_&PRE?d(=PcpL|hWUJ9kkDu5%ku;4H8_-f^wG6O(Dfv^KUL9lrY z(5MdmPhsVBjn3tWmpt+I?3Z4!GJ0Fb9FzkNN`Hhz3sA;MusJY>1y;f%b-ctsR z5rF7*h!;+gT>PzS0#pwQsnkFJmGiPz;Fd!Sz88^c77~co)BtA*gAtIj95U8|_qk6{ zSm!L@pYPDqu;5kuN;UqNH=p~d-wwbZ4pesnEw2+3|0b-v;wsLy#HRVnRChV)K8p<0O(&a zD=UT;@pB;{Q{aGTgr;lS0ul>uYaYR@@U?g9a`OvIPsQATeZd@w6QS<@ zc)<7wi?d^BJ3yKdONZU4RxVLbWGJb?h@#Hc?g+dg$E@`6NR7&7N8D`9)aE%@D=@Za zLFnHK>bclFC09=Fc0+(VhYN?U_Ou+KYwfS)LE$9KFvKSoKu_hdADJ)AZ@mOV3m|U2u0-}q=Q0i7DLjdqyOxmo`-=AQaN`V$N zvhmF_(HUalEayk{8z3Kluh_tC`2F9#Nn>xEU6Dtu76$MGRPnxm#X=9gl%^t5W?z!B zm-g0@sn*8)<=Yy1HNOzIYPo$E2c zpgjJPkLtM<9q^5+mi#~}$)Ak$CLB&NIVVela^bqlwYg5hdg2U2(3k2Z8{se;xM=_L z-#}THEd|G&y=b8%t*kLds?q07{ojde6R%0 zTP8z8E6hWTVssD&nj!~bza%wL+Hih;)bs`@%o%~OKhAG}8KmVs0PK#4Eo=8#v$5o& zrl8BxuofYwIlTaYG6R|9)~&j=|NcCmzdAuo+Q&5^qdDAY&i6nF!$d&}W>!G^XY|+t ziSP`u&zGI|g9WD;YJ<8gEI~}0sMY;!Aeu|^KJMYaS71#u&NI(jmbLD>DCf4fz%l0v;gH0PiHzQk5J?%kp^RZE^D%%G?IQfkWY`L+@3RDl3`}nGe!}ol?}3o1u>Z~~wLcXIt_Vuz<6%J_ zlzZAs^eBXdt)y?E2J9pO;l6}4%$6MV?LKw5_;-cZ3Uas#kbYIJk(_M{ant3^gID;M zxwr@Zg}lcs8*GM7x>v)fbC479zbpZNY^wtZ%vAS>e6s-w*r6t_0$ z%GUS9m2F$ZEup>T6*awm0l)>lHD~d;X*hqcHmj;^+$*izHnsf0zWX=49UtI5zs>I6 z^C`8AUy!pGV*~1}$J&WJ&<0eIJ4FR|+ z{OTEtH1cDrloj0~?%@(z5cnT~Kckn1g5DMBL^zu`AYm0>9&9Cx%HqU~U?HT>D=q;c zHA{-4U-qDe?2L;;DyeW?(C?2%*1u{dxe#x~pAk4HW;MrR?B;uJ-oHWw{?)mJT!GJv zbJ5wVzYZZP^YcS9#|l>=kY2{zreU-zx-7#r&)Z8f+iw5!-=d@#w-MrgZdhDe<~;vq z(db|Nj{fza6@INfkRuD)xc66x^Q;4{{=NL~e1+w9xMz7kuPWr88=L^y=cU^qwUvai z!mU({vaa^vHrJvLjDE$`g%X75sR*0W?g_2kW-twdVz#pdLrFVE*r{8IhItm8P6#r= z;PFpc=ZXmYdsmQt>$BwPknCB(CVGFO=(KBeY@ffTrnVPgzh~q!0@)DYlcct4!c9Ri zYUhDd{`W9V08;N~jdNWzgYu$(Cv*Z6G5ck6;T~6K0AU3Y{ZhUSCffECdI`Q2cS>DRz!i`& znRKwi_m5fqrfQ;Bw*zRUsa3+Q5L+%6zn%J?iOOwp$fcs?kWfs3^TDKYvcnX(ZUq0f zGFMVg-=b>^+kVD$w9lJo+QNn_Y*7m+i*vW1*+MxrObRAUk@qCo+Ou=y@)QRmEo^P= z=ViY+T&CV%rWE}{fPgU1RhTJQIEx^t`A=j4Hp0!-v#DONAhD9OWsuO7mp387O(OUa zUGsyF10S?`N~%~Z3gqa7pMp^h;9>b=G2!hITWi1lf)CZo!iE6{5=4|jiidT0yANvD z`=wp4Ak`LXs$nEF$;FNYY;~Bf9h~cv3DX3~K63w9kmA|o?#*<*QBv<}vrC_`mt<^E zerJYh22qkREdOzlkODrWv9zcdRD3rPhy!k_RPt%!|*%1IVH*t9E#6VUEiZ^r$Jd4zg zVOzHHF%h_y1S!$rc9CRFKAPlaX-ukgs^=ZTm~FN{C2!ks@^@a!woR3ub#f6?Q)iQg zBD;Fhc&}nkm&b!jk-j;`Feq6lTL+|@^biPDk`ZTFQ`yj6hvB5ada(_Bvx{CV9sKQb zY;Ec5FeiRx@NR)%z!G-ftQzLLY+J>YgLYXK>OAsTVaF z!ywKgi8bqDfd??aL2CKbq{{<3w-Q7=!F=fuRkEMl9hY<1@{n~~eUW|Wh3blEu$ov= zG=RJZvpm8ddVna9{v#uuUzF%Mp$gm9dp16cJ7(q*ck!{7NQFewd}~jBR#mzI7VGP* z7kCm1_3SuDi$;RLka`t5WgfI)4lduUX7$4Y_!t*-#=V zikWu-1;DhZ=OHp?W^5DEsE3p<8zQ14M;WU~6^k;1X7l{d2It{7E7^y}%%g*eTV1h`2UV zR@G=o4YD@DpA?$U0Mo9Oc{+PT3y)V%S4loQd3JKLmn26j+TG+F3`1nc$1hKR-Tanu zaT9Twi~kS;^1D*$_0T3<>H8#6>m!Pt{R0+K0;-6v3!n{TndB}!CLX3pWsW=sBYL;W z4mu_OhAXi>dAV|9iWH)sDT1TCl>t>84?ZIcyhTV(q);wd0}Gv*#Sh8i2r@{gU00>6 zF<@_n<1rfOQ@6!!7<-voWaC9BtiiNgc$I2ibfeb~&9gx_4~BziaS+67DvVc%^B5_e zu%s-p8}fu@81M5KGK;AK!-d(}We`Rx;7c)+dVn{$U3@*YydSoF^J$rugbD71=STH; z>MIV6g(r*661jP}#m$%Jab*$pYPRN^R^vY0Ma<%Ze#2$>30{+&**;xPdix7(%je1z zij647zGi3=Mofo``8Gv`2~cUI70&{=`AG{uIm4N(m=l!w@@bS(FLW2!UUVe>iyA}z zDYP1$cLH21EJ_K)st<-x%mvBJqROxFUE05R zi722#T=E8(xS1KEiTLWTc4qM^JVq6<2ts$gyLZ;L8TGqBc;qgw3TeH-@`!dx&42!@ zuPgN)aQAH2w~TgZ7mZ#Roycw@DVMd!yU#r~+gb+tIaN9LTZLe2lZFMpz2!Nq3$&SKCC16OkI6xQ+aQfvK$sp@0=R(v z#C#hv4Ulq}d7Kpb4MPynGAQJh5QYF`Ozkn3zc2%@wgUas)Oiwlnj=zgJM6!Q8xRc8 z*3f9fXMYMfa$cD~{(zhzz}*T1VE&SYD(`XO?Y^A=*y75v>QzlOz6#>wEn2Ijxvbr? zFcf@T*m;(Vw%OPp0rwd|>Dv>`Q=Ei}P`G)vrh+V`T3Sn^R=w_*jF9A9Q2{K_1Pz}J zl8UtrMoXt$aQHJ21_y}dO#0_4#TC~64-3El+JR(F&H24G*B4bwySWVjy6zx73hI4s zs=mh^E1a|B&n?D1`c?`GZo3wARAH`I92f=u3>(}7!uup9H3YT)$vjQH&Aa3r-!gb@ z{%R3el}#Yt5jCb48=mx}FrEO8Z4U#ykBiMMhN$L3&b@fChX(cwo}vL+^*hEQ{<)N& zP)sq0nFCyl&vITM;NT4q-Ks6Ubt&^MY8fAzM5QcMi2TwY(z?36_r51ko0~Jq!4@H5 zpSHQT5LD#NWrANG#c)#Ry@#Y&X(w%LDjt$x$?^vVZ|np*v|Vr(S2UdgWaaZQEV^TKBet9Gzm zFPX%G6zbb`zEcYsDE9>*SGME&so)|^)X$b0Z936V;P@njFlSPLsZp6+rjqQ3f9?io z*An{4_1mOTp2n~Pr>F}K{kg1P5YZ%}EYaa^gVWe&m|EZ$F-!fAL>>ZY(;F#VaY{+F z5aU3s={N=4zMlb^C_YX|1eWpKU-sJPJu!)Te_ua@hlSe_MrlkBMT4HUTzZdJMo}9^ z^OG9UgP>q-1p$+@+xayZj&w^?8$jE_nxR5n|2&i*1T=m{5uy==n4l%DoubBoKsSa_44hGh6+hM$fA$_*C@0!t&8=rlNLQMQ&->K8 zb?n|1>(1Xoc9p)=6;0M^YyHF_5Q8NC`R@yv)GC}p=9rj9qHWriz^&B}{Rl{{Mzov< z87lX)E<=dh%MN}`rN+;0BmE`&))sB__7#|YvN=h;+Dav~g7dAZw0Rpp?+ek$e1Zi5 zCgBq3d!}60L?kp*vYs!}b}7?SajysjgOf?0yXEH=mNd~-k>GTHiVdh-7<$P>S)bleMwdJy-#;p(t`jvRg zh>}7GOs^YGWeaX|McKvMSi$Kz^SJ;71K=takYc}DGIRu4ya#c>ezgTgzY;LYAvPho z49@}yG^H#p%0><@kKt$S3Jm%<;&QbIn;uzoX(;R43bAF9=92FGeC1~mfW#I@E-auQ z*q#DdR{`n%#lL#?ELj_6&s)(FCzsCyj*bW%>Qum zvW>bteG-9&c*T>K5u2Vfh)e zQk!tk%kw@L5Pxwr8tth(pHnxYZts>tm*t2C>awOxLx58Z6O!}!oB-CImEN;w+Lp_f zH9be;TG5@j>D7O(Iwj68%If*NAU|N$_VrPHvI_y5AuHo^v!nqtv#f!38C4w4IcG2N zvlEJ&2J`pqvs63adGV7I&NayK+e_2tzD3{11qLZ50g*l+PJMU5^@1)-!fk(k2J*wb z#x~Tsq~hbdNC99ryI@rcr4M@fr(Fve-em?-2aM{&T<+Vfgq*6_{z)|#;JHf+R0y-& z4SaY-;foH}yB`;`H})?)bWP@Xx12vx&$-5=J4gX+u*#5(H=wgDF9xK)4g>ZADkyuMYIjZp69h4w?aSXZ{ROk~1Nko1A^UJQ%24@b ze2)^cx#oE<^MxPN>S$6zr24!pT2%*yi}|$>dKl8dEs`SwhaA)0;A(MHyg&X>fZ3J( zqc5=>%gLsn{Moz7)^;jnX_`wzo++_-Ve8q&2}#_Fia&5p4rb3#A^N#sSldn&cgp)0 zd_-a2tPN?icl^3U{QO`Z@Z5A+rAH?nhV6CSPkc>Ou_>2gbfH5OkBg1WgREEqxie+3 zrWD@8gW_9smUvN%c|28y5tSTfv8V~aoD&4#lZxUJECKyAEfr~JYIQYhB+d$*!dJSW zmATxB=0BZ#80p7a%%O0-c8HS;Z1q!ksKy%$^@c>5Jm-ykxu*{ja{+$u{xRX)Z~^wx z{^oBM8?x(nyA9HNxSXagRTAfUz5tTYmk=!x;moqQ(q`$tW>Tz{^AVVTXG`3Jvez+@ zh&myRTdk%e=lcs(Naz1~Ql65dJqK&U+!smxje4n-qYL?!eRJPGd#QR;lk*JYe%`HX z@(=4%u6(Na#V&JF-fb`4v6fUKWABUu)p{LlWl-L1r2Df2>XygU{Pa3LF9AKPBDI`H zP$KgK?_DCSgVa(vT%IlP8Kabhu#x;}urCz)%5vpgF>8dJWUX|e$>qt+-AdrDSF0s6 zD_9XvPgNlI?RkI{dS5ew&X8*F6-uGLX_hBG$BOd6B zJwo3RpbAR%UN#YcAizsM{=}u9;U2Qi&gCKTrGXdy0o*YM_(D*e zJzyK2W6H%2mM&+^8{|BDsXf?>0d?r2yd#vn%#~fcU9wmt>SqeK6v&HrZRX@{O(huB zrE!r33nYU^;(A@!o&cZ~B35+94h0b_G!O2u6AnIPXT8CxfOA*uDN(?r9WDP33R4oa z>3{yG$iSXKVS0`$76KT*S0mn17Aga<&JB>rAXd`)Y;9@|43lp6DUvq*2FV6t;801p$Z_Wm<0J(pMfbkO_N06rpqsLJ5Rg@Ln zi9F8=wOA_WRlO+u2I*-l+9*sabwQBxZ5v-JHVrZ0X`#L@E}%oYQad%%QFPrr>h znBMa$=-`#WIinv0zC>8q0yc#N6}@NLK3Q#9n#xoNanCvye*d`=#hha~iu^lTK&0Q_ zL;?AjsM)TsR1Stck}e|947o%qK%k_ZnnGae{S3_-1pspXiqN`50M-D<_kgRwsw*X6 zqCks1yq~v1Ob#^Yns4z~iNGYNY|T%?*%HQ8OKntdVKIF!d}_ZH0?_<@i@=sqsjE49J5w>) zHB&T3WR$*Vw5qoNF6-~Z!M1owfLo|cjiyl-I1aTW-{O#YydeY(#oMVMa@{gC9skGm z^O>4Nr%*NqK~!3+kWm~T+lwTIZg9jFO4|Na35#9;)t9&NOU?YgVAf1c^oO2!#m(74g3;! zD=t*qE!HjT)aObT@&Bjk|4}WAx^Dq!d_NEj2WA?`0ApdsCe9wXIw0;Iywihsd*F#p z+&yr0K%8I)W+a9qHyI#;iC}Q*K_vxMw{G3D_gv%eTnih=F9K#>B>gd}KK_wXBdT zZJtyX(|q;0RDo66LrfCO;;|6qCQWPBf7uvAemGtfbY#jIJ-=E>x$YSkCO^TWOpijE zcY@PDmML!!*bh~ufW-q4*czE5epGk4uTay*a2HJ`LE!ZU&IgiGLpk~aoAo9;{hr_H zD}-iC_ZFc9x7D!8RwaSxAX{4YEvD>1KY2>aE&bEt z(wKmfX%K;2F`Zt~t~nPxJ8cpPYs(7IYyDQU3t0A~=;#$2S92ANSXEda58x^+9$3o;GMVTdgtz1I54FFo-4BsipK`!kv+4$zm0g!cg~ z!@??J05as;Tv^asoIekh-&~;;aOwPwlI^UDLrn(&HciwQEROWgQ-IB~U=o4eHfzHQ zOiSBwt;x6LRE#uIPSRPPAzhb!KesMjgcTS$wok}mLBk)x9j6e6+as%%a67%!``pIJ zM1{7jrOflP2W7^1X3zY@SJV_?61;6Ql{w`2c`0H#E_2WOU}eD+wzFGWrE*au;{s?4 z`*>670jIY4R$*O)G6Z2lfpb=zUd3kwThx{pRQU&E=pM`m7M7-ST*gu$OikDPE<2<_ zDgG}5Cxyd0$F=L|E!Q1MeCqxL;amHAnjgss_*pdNnmEnH@BI56S$QrGzP`kd^3}=S znN8S;Cc@ej7eEJ~QkRw!f=bF^)OZJu{6^s0%6W@@4c+0tD7S7xCtQOh zW=^pSycSXJR>xSYeGQ(RbUW3JgP*?^%`r)&VM?aV-CBxJ{wW08k4>-RHyW*~ z;G6r!Rvurh7~Ilw<&97;s+M8$BmkgH=^3Iog7kuXV0Y#QRLDxDXa-!IhYX%3u7YkdLmYJ>Woz zktJ#1dJ5+C6$Kb@+BHCh!$R_x1oRH7zZRO$s+%ApOVPfxVAYZAP+d9CdcVvnAaQCE z$REDN|WP{(V(D1gP&c%^`2=BOlGCndDM5 zNy5Jb%Jz?jAy74_Z>vxSvt8vg&`Ik34bqd1QX~=WN?$PDuw9&4|L6ZU2pDQax|XTS zq;i&=zBrR(>@hU8=ku(Kb+NH&mmNlx4`}RIxC=Xuh_b;Y-iGU6xjkIOpR&U>0G5F=Ogh|olWg1QaLkPrt zGTG*$S7-LQO#6QP`?{bh?5`eqkI&R+t+un*7Pc(&o3yZo< z0LaQkuUu0=D7?qVQ)rurjJlC}FVs^=KrTScxlN1r8Bi&YWalhsT0-4zD+T^FwbJ`s zvR3Bc`LPoG%A(>$o`eSTbBciQXLQZUeD-@3@YkHib7d|1H*SqWz~6FdMl}GnqPL)J z1AWc9_^c~$2_UA!9**LZko`U1b9pW(`&pa9KBF*g+$Y3_WcYmo-4g%>8Q$hVeCd4g(Nk;wK4VIiCgE>ih2m=1~ z7xR2tBh*r>%RTZox2CyK^d0~~*?KFTDBMbFNNsSuKR-cUfaMK;e+o$o>~CE>*T?9W zirZYYYc&rrfJP)-mpShV3mX(i+YA4Ge=vm10Oq-3earoy|8q747wCO%8%%1Bzn^G@ z?c5THV1Krn5y(s_BUG&2;!l(TlKRQKNl!%7MFeTI5ox0k-M&Q{ISFGxh;9Yj_yJ~V zRQp>eaogI?Z6tMnaOzykd0zhZg}I3sS$=$5D+waZ=pi>5ag6md*&>cjDfl$ zQnvwXac*Tx!T!K{5~936*+<22+{6l|mRz2DkG3+RPaJCd0!KR%W9mIWag|}VZn@@J z)X#_`Q&@)SNq!2@&VINkJB=S@nd8+zy{sHv75G~gFlblSPYAAZD}d&?f0^{6GOPhm z$vz48#0*la-x!!)xnM9ML+-Y?azNn%0?5I0>z+B2MshAAi$gl4akFmSas{wx>HgAJ zBO$ewIh!9HJPG&>IwU519YaU56q}c&uL#334rAZA~*W2@HA}E!@yb-nOP{g;%m7{PGyRKRJl~ zf?yN^o3pwl;C2+nEQrhTGqJh3h&7tQ$`?A z-^Nr}9u)60DI%GcrZ0L(gqG)HANW82hyBE~n>51oo=dLvFH-ha>1C}5*aT%0a<8e{ zeru8QcVz#_k$$Ds4h{ScxsQj>F^U`^DfLs%Rs5!}WUv2{MGm_bk9Dzvx^qJ)GC%zdUY z$AANZ0o}>s;1-oZkJ>>5%Ms=?(4eu3%kwx4`)wGVt=wX{zEv+;TMky-GV1*Vj-mwZ zdxWI{TG%#NQ@oz#+cTb*zrhN7a;xlG zDVC*tt29B?^n{U?_xxA)7T`kOlNa1x0ck5o#4X*yagZ^KOrSz=$(_xd9}r2LtopVZ zft$Tm%9#(DeTjz`Or%P*mh5o_N^SkscjeBQuH0z!b>`s001n0oR3>(V_o*&n60Ec2 zud@6&{a0n8o!lqS>YKA9n-zU?Vnjz11iTAEcK%9EIkgNNKNUXTm#Oa!&1%Q_R!RWV z6lzEbIxPkUei!37`%L9;8(@hc*5%xKq(%UG*BlGyh+Rn6t$%FLwJ1xAx#A(@zpCWU z8?8C*Cr%AtiRHD|ud9+bm7iGps%cM^76~CF^155lk9u$ndv%ynJ3F-7L1@$Ll2Tm? zjoKV@EV=VyV^Ag+{f87PLGeU9QX6iFXJ76%@WnjvIlFWqwgJKi?`@N?d3_kCWNcM6 zz_oYdJ)|Dk8Y~}~q=eeF$4ebvupdk-OJr}dKS?Ls3{vk!A<(S(?C9r8#>e#zj% zlF*4~R{mB-U(}{v1-@#bpIjgZ-1Vx+Y~O+}$(gb(V!7+isW^IylDnGtE(REN}v>I_=^Ux5&3L~JDaM{9P{aroG;ntc`$Np3D zVa>O$w7@bS{^x(!zdXJbQK;N*bG|KoqbC4({xvxS=T;^1W=E2Nm)sP{y6`aYKgqw$mPlQyAQ zk)dAnDjk>!&At^(Ytya;`;V;!6>F$_5_mtmXYsr$)0K^CVgB;$=R%w_ec6-7)cO8k zi?rucMgg*$gUUSivll#wNq4*r^LgE&-9J1~DdawjUidS*oiYq?S10+gX|`dcLEk6# z-xuc#GVO9-rsk|*hFGXJy>$WcC)~U$7yL8LL78DUiRBb%0RGZLn(-* zU{SWp&VSi;=|*K+cWhv66tD@-OO)g_I~-K}e1!T+-1%XsB7y^DTMqojO|e9fyZkK8 zZejuV7;tx!*8ugAu9XS^^zs1!eu8B(NrLq5WDd(VYHK;I04U<;`v)a02m(}Rc|^pP zftqH%5EEoLxS&G=56%Mls^8drucv8vjPx-s$W+9Qw2UGNkoaN}#1si`SvRK`9g$yA zn~Ndf5G03Qt{oCle z`z%}|%=%+OU(j=2@P^G?JG`D;hH<7}B{*ULK zgM^B5D~!l7K5H3FfR(^xKM3(XS~)n^>a2(mFcWPT-d+j`I;jOBYfbv+R3H`q{00g1m1%c{WGtAZxw19Xd9h%(cH7#v*gVQWb*sC z_zIm%{DLo7?FpQ=uo}4#OPtEID-kvTMD*_CCEn-PKQuF?r9~pac@?4d$JY`-@1x7m zDr}H8WbEHedz&koO#2qtYu5bOoC{h8nB)9?&uYOM){=XSaH(bcd!XdUV^RJYgg;{Q z>@(3a=`g0DaR1(|&YqFU4Kz(`^-iUhtG+(S?9o)V%5#v~EsYRJo1c(~_A25+w9GjP z6F=FU)ymHe$kZSq9@`S8joV?}b|?A$5bB1I!Jo|u8b2u#r8nhb;u=4p_kS5eZZ_IMz z`@>E~-2D*V7g*7%9Oi3o!lx?%q8IQ|s5Fj^IO_}99W7Q1S`VO*_k+DYMLSiXM?&Rr zuRPerL=QD{H1x6~ z2Fw!F}okUU`B6&p;4D&xFt9kau*b`llZ z-T;TbhbEwtUWz%LCZIDPe!7d{sQfe?cLz(4io{%$u4n^Jy~V%iP7I=63&F21f>b`@ z;aa&HMaK^{e|&X_yp1z8rCViyA#v_zg`NNWUp5*k_nBvE&m|Gvf;MOa^`|!UJDHNO zMo?WaN>Uj0TYEI@w=URm6b=bmHSKL96*Bck@@Ie{3B&@UtUpG!md&?efX2x{+BE*@ zzc{z@eugN-euj$W9%P@P&zwM8!6kJYj6i;KY5~yO&>%>q);)5Y%$_;3A16P=V8m>2 z>+)%fT-pMfQL{`7C12_bkqAuT1h=XNbB+ z5s~v`6}u8tj2}a?!aU#NDweQ9EFWxnQCoqs3`yYHzN0J7*}CT^<(Y(| zGg>7k#7p*GN$kK_$ap&Eu5+1v{{!=3M$2S<{PsHv9~>v_DPW zDRa)8FU2UIOIs1W$8NxNe`XGTbDc=vvxs!HRr|ZJFbaQ9+QO~-*_cXDxNOJYk0{fS z1MAtk(l7rTmo3tmCkv!u1Zn)&5Y>n~XSv*i8AP?$(#>-Iid|t6KZ+887}KmmZ4Ofq zUu2?>Y7h`P%}^d>4+JjkwH|q{)?I@k!XJTXSXLKz`m!L%cMi4P7wF1tml55T2p!_g*p=#8rc(D*)%wVYdnC04J^cT(pqr%dWsu&q!*@GVHywMUt-T)-@OTn+1W=} z5-&jY#|v$0*|8}QVR6Mn*pZ=fAq?OL5Wl_I4JoPbDY((yz-xU43@Ar~{pIC(X%n7Z!(mQdd>1 z#TRhWm{jtKH_L;gml|Sg0HYVfd12;KoDM+RS%gPb^9%{@@?>=e(M!~tq_?`TxyS{e z-86y$6Rv|IgRUm>Ukr>?H14qbdR6 zZk^U2$es;Q<`nlv$6yVfe_8;EtH<2mZX2QI=QM=_XkGqkjkbfOZe7%rx9}GsUoPkT zrEc?$$@q|i3aO*g5oS=x`ALl?dRy+u#|YKC0n|&V#*9ihk;O^Oq(}F>1U!mXjgHFEm7Wv3tlXGKXmG0a4ivumFKdw*do@D!BY0; ztr9S?T#jzzv7Tx7b#TpP=7K zx&CX=f$n9e*=^4CmN~R{64wE;$D5>>PwgAoCFA#{s}Dbmk5UNl-~~iTa2AXZ2n6wX zh>bbA%xfdCK{vN5fYE5g?~6*nut<$!NO51R(aFCYZB&B!r& ztCP?K0n_>da3QsO-{l36?OLKbMwQxj71;b9<0T+4ld3|H)$SUMlsCI#d(=4*+Ns^9 z^Suaiz98Gbu~B44yErJ2$TNx)UFR0XYA$0fKAhf`UBt>-1#o}>;GPmJR7^=W=`S$^ zYp;n!Q@<0Rs5PcU!`S)>>rC*56QPO#(nx~g3gLE8o7xLZyq|DQ{|x6RaljABCH@2o zqVA7*V3*q-n8Du(0;%=p6>RxPk zwc>(A7wM(hs?^Kd)Qk|0G5GuBGSHP1-W%*!@A5<#1M%@Pa4lOi9Yre>af#|g08%K`h%a*@8k|q^s>XqD9U`62-xo`_+ z+BA}0##?048!_(HJyQTu13}Zf1@=ot?-RKkkPH5cxD_S@Lu_Y`3yadb4Q^j{^nz?- zC9avrQSuYr3ri@(Tbu*)H{1d6suI?4Siqi4hmH^t3oOr^n`fE0K(qcn_bk)#LMa+9 zT|9qh@mOn=0YE(=x$cZ_I@^W!$5I$B}V}mx04)R_R>f>f;3ySrK!kJ(r+OZlNhD{ zj+rH~qJ zZ}4DzMn7&}+tEc4&L?v6ZAcL8s>Klcp z5&(YoX!{=on}jXk)q&@e9h)J>H>XA^TyHF}U3R~1YIExdr5uRvqqcG?;Zc2~%r6!NO3CaeWU=y6Wf`f&{PT&*z7x22 zb1}iYnvW4R{=Wp?t1h%PDF$#HObv2<;+~s(ezyAUtvo%A08Mp`PW)a+!|7vQumvDU==6S}D(P zW)C`ev=|6WaQAS~jClh|2&FQJ>^q=XKC;jjJ=J(Ng-y2!{w1Q}4#9`-S5Ka#s0w5} z8}2E8TYFvJmD3EIuDU@-m@cYD+Xjupga7;=?=xK&2*YW+2rD@A!vfz&x2-4|A=^23 z#pt)OWJy`HtPM^wsh*H#O3~iW23Fawew(AMwn1z!({IUEwB^Fe#QeR_+i*<^# z^$#!6KKZRoO$39We>4HCODl}qWG$?6l*sRA)2J)E`gOqqx9{iggEwc6w_Fnxe!}EN z+Q+2Q8hiWV$mE4t5U3BJ8oyH;oi&y%UHU#u7)8bnM7$5{|`Qk4ba{lg8hO{*z z?E;nUON-WyvZ5n3C^f(`*ms9)}ooD)ItcJ3(<(#)jeZ}-5$5TO%vfzHe z0U2g1=t-J@jUf0eaZeD)j7$hlPb#pskRHyng^id++x30Q-Xcd)M1GG=RC&NcZ9r?u z0Cn;AS5sPQWOQhhwY;D8xyQHB%Zmbn8kL7k#WF#{=aWVRfl zuYj`wOeXHQIU;@q7`yi12X-1m_J97jCvXqR!h)NVtb&w75L2C)OfBX>Bi=Krs~TEx zV}pW2?!$!!aKjbe^MhDM520vX)Kz6IKzdcgpr-)X3XtN0zb00_?OIpPUQzy)&+09h zTteT6H|#)J*<}*)FI1N~!S-#2{OQ9}KWucoLO#Pd$9q=m&$wW)mkW`$90K~iF8v#a zduE!hIlIq=pe2kptpFzarutQ)9&n*rq%kO9&~bZ%-ZYnHZ`p0cvVLktBu~V~;^({H zswKdsumY0)@+8)tIJHC?t9@RA+Y8mm6z|#Edy*8~3p%V{geqkaa?CQAEno$38U3n} z;G9i;|6cYs|I~p3Sc_H33C_SLMrs8E_gx_Ap{W?pa5bNy&pp^$)@tx$THq9rZKq*n zW#s5Gg=|^NzKD=^Ot^$agpOUk1RR8YQ6o+1EkLGm9PhcM^jMq2Y}HF@QgY#G$%-!9 z;NsaI2@L!=NHGq7j`Zj6vzi~3IWweLS5h3UAd_)nis1%i#9?ufLgvBx^M&hoBRtA^ zpE9ff&{Pz-G(goC=Q17Fu4EzqYN`&AfZHMUrRp|7lLK;P@fV0n&B@Z~AjE=|UvG&x zy8y}U0YuJ!os{&pVBf^4ceYfek%-C@-)UV3KBS#%|K+CdK^2czjj?2WW<%HSAaEwr zNXaO72{Oo7ev|Ifq`h|wQ(y1JgEX}cdCroY4{EA6oa$IFcEdjd!Af+ys$~$;s0g$S zD!q$VvDsy8e1P&JqHh)1IQ8xt4%_Ft2DqeZGN|?ahbs_PLU~J>t)%mtS{n6|{27jd z39w>{bD*vjEOMcfW$}GWyLm^|Xk`>-YCxA!LoVbF4V-iGIvp86F-fq&4jfnC&v~P= z97{!j6X_ZSPA4Q2%=wLaqQsK0P3?zEGdGC8n+krqB&i~yF3?A$_gpZ??Na=78+MG1 zs*~EPwT}jbJ2*I>MQmC7bks%PC!c++6n9_cn;TXAuy?QLFx9g)Zq?y#j|O{yNTdEl z8lYIMD8gp0IsXsO29oaRpfiG-GK9l{(5@Oxu-(L!zb!$nMe;!hJ^agUyW;;jA#Xxw z%hufhOvQNn+otn_4L@kx!qsN^lLzBnVlxuv(M0kMWa2YMz0^UIe5g@9b)5pNzA6L> zSB1YzT8B5hTPv6(6?J}StX!tvTNM`qX)Iw*_m4Bb;KES3hP>DV-%bor3UB(2m}4oe|$zkS)0OAOy-#acb>r9_T%5@ z*|pE#S#1UQVMhFHB}NIN^HQ;&-&I+4tSMLU%NX>^qFG<14Dgmb zM}g%tViNdp(z>9+#NZYKEwnsStJErne%db02+E36SP~sG0GA9wv%f+4`J&2PGN{aF z*cV9!V3aV?GM{UcU?QKZaF5o~fVBXPsu5gU4vh4+Ekyh~&6fh`5Or%i!5-6qrla?x z56P5?{>QwFyRxuq~jEtj=DD$^Fi(1e1>_7fKz3$(QJ&a{{y=R2_&wt+n^OH9)ozUEt zA#;cdV6T?@BrDM3Q1z`|pLMyGRzUdVx8KmU3gnOm3ZdQ#8Ai#T zP1jovX)Y@orEOv*!M`!1xRH<+-j9=tT*DQX_|^Uyc}oO0SVAnDC?l1ZLdrS8D-vO>tm2s zthj*t@#QuvwkS`VOZo&_k}wzKRt-R|U`pR-_ZYpka(>6;;5)(Mf#G43NrVW|t!+%F zrba7a%bZ^6eiq9V05D|-NzrXY^K6(fovo#?k>=dWJX`g5Or>5jD82%(nu(@{m2eAj z;Zo@(=CK?qOOQ1`bZ4QbEUj6qSXNTd{shXCCEOdh;~(z(oi*o0zzTkg6DncNGbr;? zT(FqD%2^6#6f4gzvAXrS&Ck&-+%_4USifO)>-PyJwPh*@&7g1<)wm%?Z<_#s4quj2 z!2f!P|(KN$Hmiz(V}t#5}j1 zIgVK=r(&eu$4r8=ZCK%L?q{bs%~#Oy#9(d8G_j(k`O$`bU|ov14fj3o=i9QGpWNSx zDU;iCg8W8n00D3L%=S-q{ACGc)IG~3saJ(x*kEPM*BYF$Tuaa>w}JeCaY#b`iEvZH zJbzs=2%s-rQ}U<>%CiWkEDP^rfLPMSe@H6ylnilkCjPA8q^H9b9r*bkUUZCF*83zs z6cvHH%~cSkZec~@%73E)XesvD0+--?20}K_wi!z%_S^<(WIl%7Tr--Fr-}wwiT_@= z0n`8rDoX%9^gPpeOCkAf*u9-mEjroHAIp%2D>7VVRkHUnQplzOWY03R(BDdJ<+erM z>g5&00js=3Tq6KPQpi3S^h7@E`B)r?u(;SKOD2Sh9|?G~Ly%jR@-~q6 zJwNNG-oKqLkLHw(fV9}7sc1Qpdx4;*Mj_yk&a)iK6>V{Pfqh=pR(N@$g^ASs2tcf~ z>sE-hj^RB3m-}}$3TjKMD`kqA0<|b3)RcQ^tM4KUf!rx6pK8 zxwix$DuSizRJ)8+5mS{}hKNA+&Ip$Pkchxl9KcUPwnAouRW0k0rWw{2m4z63`H6$Y z!X$24wI^i;Q2o4+E$-_&RbtZ0Z$;CjE9D@Y8$I{&ls0_`;fmiZ(cfJCHD zU97ooczL)pEAgH`IXfz%K(WIl^phR3c!qnbwu{X7i-(yo2bpJZM1*OG;r?!#DLYRA zL#*lf%q7KZVNlN2r+~lU!6G08x&PbPrzu1jgkqqwfNR7^q>Fp z<@s4lKvG+=Pl9NUICXm>;C%CUNmCm|F`mCxdZ&yZK1mAG)-A4xbAR!tg%EY00% z{i0!Iq@Ly%m0LgMQGhN9(sB*u{prU^l!zlas-d=4w;0idrSd5-&A)6u}~4l zBygYA3W`=++C2mDihvb76N=9K*$SqV=q-(C%5@nc1%x@y-@Ek+Z3v!rK-MK!8u?k! z@Nuk~4(UzR)Kz+A4SiO0Wj>c$7l_pswn)IVPj2CP>&L45d(8S4R?-W8zRo8A;$v9v zbJ?EHcH|YbuDHj|!1^aCnPy*s+@A)`Al5T0hzYbB;oP$eMHVnQE;H_}QPNY8m1eNV`r%1te!EoN#P;fPFH*A>#J?f1#ne{Ctt$r9Oe zk@ZkpWY5@>o6iyEsaF&HT%)Wshu}8_Nb?hZxQ8v#mEhBq5if;=(GPvO6!W)*?z%3yGSuhIwv@YSe@b`0$Ut@u-EU+6sk}*>Wm&Y zkXXp}s;DNZkMcMBLxJauj48NcwF^|I2UVq~qm9?UvMUklYjsyW*zXBG6rvlOg)6m0 zI$?_mD4{^$beh)XGukZ~*+Bt+c{#&S!-u@tMOayN;eA%5uc}^lZ?dHBrtqXY(8~VW zze6#yI_?GGj2W96-9q>~C(!&KCpf=Q?_(Gfz8=f{TQV#||3o6zv zW6L~N2({!Yj(S(EI9E}{jE0N^V|WmlhCu%!jBhPb@>XSq@7Gv@(WS~sarjde3P{qP zjzu_PD&VHBNExYNn1~n>pjH5;QwXNb04rsc;~y~U1+q_EPRf2(E86-U%{k+93-c@! zDx9JtESk47(zXEx08p8>Ip6j3$Ab(iS}uA!4I0RcBbsDF~^J_kqv;j_?Gee-ae-)cl$vq;9!>=7!lf>?M*~`s|{p zw}B~{(Ew>`lIJ4Nj_MaJwXgP%c53mapg+DngD6; zR{A}`R4+pQ+&<8C)e9vwSku#?D<1Tgdyxi=h?AbP#alpG$=2WkjebGv7 z&dVXs{2WaUiro3Ts938=Z%?oIBhH;LTUY^-!6ElQ{|%V$`0r1vL^9^NtzTUS9Ey&3 z6*QKn53*$n_WUIb@&gVjgF`@dN zjd1)+u2SF{bUVF3e$q(4pq9ORK*JE~cC5F-@=rgD-OM)hebxdJR@~k#;^-@b_t`Q{ z{e*2Nr}K)K+NP}#Q)lGuFsO+@-cEnZvVPCd_XD$ruKV0Ju;<6ngB~$@&sT>_m&18t zw3pH?WO)303JYz!mvj#kq&SuSRvAgx=QGe&A^`8ns8Dq$AImG3sd!t~YN?R-ECKSe zG{2-}T0;23 zB1rFe%GFFQTxj!Fk2| zY)uXR1&p+)_h`|^Jwa_CZY>ZI_0~$u8UPBXa|Yb=<%uC?(Y-LvYT9ro@L45{ z&tHiy!1^9=`+cy=oX|P^R)}r5@8<;pl(z#5OY8&OGi?IyfsgizLZG)al*@G6O!(Xg zODNjTpkoTFa&T4YFZ_PbR@`rIV_73^Z)&wdt>%CH)c7;Jq=6hhxikPJ{Gra;4q}}* z0-YWq;vCZpy31RIOZ_B&>#|@awe$C-G$@k+%2?*X_k;m;pLMOK_L&g=csk2pLSjA} zYQY+X^h);_dlC>17&M;B;!( z8=ky%bMh>%56qgHgR9Tq*&cUtatldR!#VF6>PMR>th5Pw3u~(4kX*UFd9Gr0E(I8h zTrm~9T)>Zcin2*Ue+~WTeOP167v_?e&hG;`{*-lHpSdH_FI!0g*JGtWp;AP7`|r=g!tfa|L0IUIkY1v$f=ae;$%UcZ z!|;D_88Z;;XVGX~{VEEl7vh}D9~$4HnaYU!``VSbxdG;hHAVfGxS`sfB{5UpL}Hf6q=k>=nbQ zpIS^R!By>#5cK@#zh}>$R|S*SGPSIwb@7))ai{dV89s`kAJ!)}y-aN&(TM7)=S|1u zo-iseYfDS4FhUQ4x|QB%tOW%i`N1d|KZW@h1u9%O;Loeg87*)=#7?PC@1QR{kuZNbT4B43rzs#^Q{D^gq{y?hi}!?KOaEmP$0C?ng8GaJ88H^ zaLRrrLgWgtEV$xe9^3}x)GJwWYhV_!+-H+=XrZt|dCToAdu2a2<)Q~6YC8#vwj$W1 zRe*tc)@6}{;`EBVpe(*k@Dk7DjHx+T-4j^QCHw#TKb?w;_ZU`k{#3g7eAw%v;u;jT zoF4VStU;b#)zadoGIg%1b~=bbx@~y>rgYDR0;cpu51I(H#vVn+D>16e9*b}}GWXnr z_wULn4v7FoUeP(eg690RNBqJNZZj)o53#DfeV_FquhnY{x_<*OHH}*?6DVB#q@Lfr z(!#w30O95FKV^PJpx`}Rh;M0AY+qCY23oOh6#5w&K^c1j%Rkg(_w<4&-p9)kNo3Uv z@>YsB(xTv6{ulB=-5%0>a)8`;}$vClj@(h74KzvPaiMmKt$g4j7C zO@8w8G~PegCUWJOEFmtdu-o&wn%eS~Q`n6wp%K7_uhZXV@iX!rhMR)t1kZ<>vM zvcu%qmBT{Gw=R`Pj2c~u#k{EH&yA#7H3smM@&EkFVT_9ffBZL%zdqpeKf+Rm2&9k) z1-BmHglqZsq5f$4?wh=`V!-QW9y8G`0AhYQ0ovF2Mo2sxABf>wJ$O8(neFO1ch178 zKtde?4m|t3KCLZ8RGEwMb!CN3chp4#0 z?2*WauU_dt|4s1zEq}bBunK#Oz9OIu%O!)$t9DY3=zq#y5kNw@`2EhDOo-lIt6r5E zrSGkPZ5Ng~D+8-uIquF)S^J|Z1q_wQ?m(%Tt}AfP$`R)`$xQcK%rAQJ`AT@}e5;nS zd8Wo4dHv^DuEAr58FVxvza=1b59W8GYLQqC>q8!%JeGj|Fh=963DZPALS>Ywysk{teIvH#cq#9~~Ss!@bkzZ40i zx&Z>#z*6~A&?V{~4fcr!WT?a}|I#yCUP#kZwOd^JY|brg-@;FXM#L>#)r!DM<7bSs$Dz6PbGc4Z9u_aE%wWqxxQe^($#yS>f6=#&ga6a2Z>zn4@Ml*Z5sNv(sPJEgWh2O5Cnt$3Xtnr4_+q z<`Pc-w8r+lDm*y>irAn*WlOZMn=+b2O*W8=FSsv;qpj5*8N<%+COELZKKh%fercft zia}#&8~Q9)STWkG%^;+5A3<~PzlP4Q=I_lPVK?)U@o5xzp!lSqF z(`nS)C=K zA3fm)58Lyt-(oww`$rDoUO)G8^ONgmb$IidGkMC9g#qCI4~0boO$%TGV{w?&eJg}2 zFHoP{`Kth3g(o9C-;Cjf9SHN5-rLs`RThZvA-hHbP5F{9vk=5C2yr&g6YXhOiu;z*tzgWxfen@@Y*d4Fn zmqX6un8|1=Hcrh~ouv`*j``PHWcpB)Y-#U#i0N$?$kb-r0!%GximW*Qf13X0)|O!D z*~ED7R1iQDXhJw3d|^U*(B(ld55DbzmIqxXqz4@)1Rw_p$N>U!K!bH&oWYsxzf~0( ze8`BbBZG_5oXx`^E*0Sv8(LA>_!aP?!vb3uC;;PZIdJk7l$Q3n7=!+vaNfXK1B8s-?>aH90hip~`axv=C?y-$LKiMhpJ)iT0=ZCO?dQ zwwJGtv3WKyB|B#-GJAu#om5>CS7!?L@jbDa6}W98AeSTvI-z*EEq*@^>-$`WT&BZe zVG9n@3&`SBNZ3Mp@|dsO(JGiG7&Ba*A+spm1DCUpJ`w_%HL`yR=+X`R@tkajut_2J zz!YH))AKi6hpOc?&1jJVHt=VkNqtKgxuC60uOC}U8A%>*X{ptsV`yp%Kamy-NxQzE zOa)pR)K&o0RZ?8Kv&Uh`mDZ7Tq5WZ%CH4NTl6ud`_@yAQK#k-ILZ7jVb9fIlmbf_I z0;ajr1yF&~H|T`{$SA}F`aLG<@AHRGru*qyL4OAb_iQRIP34jl&HTW7i2Yn5!bUwl z!@i_lk^b|4B0vYcGSduo^KP_;=Lrputqzecma?dM`s{p}e29$VTnoqkWaEdE3hc$V0o zZ3#o^6}QK%aY58o7;qIIZ}+$F2^I>8EyxU#A0iTRu3%cBw?O=Cp|TYhLSoJ`{2Ax0 zo1zi+t?i3WG(8XOeUhY>&$ZhH!HdvUBRY~dCrSXkpXpSFkB3t%N71d{NMwPIrp%tQ z232v)p8TqA5lh5P2Z8!Zp{*^(azIU+2ZPld^i07Fj^`i@g0O~Kt18Y-I{H?f8USTdUEW)DLtnqZHm__om7-50JDKr&j;$ zUG8H7;WDKuZ*6mgvU|C;(xM$JoqNVq@9(d{Y*Aacz@@cC_W~%Js4F_X%03tKx3!Ji zb;EgdNi4Wkw=H^3Mx>TV--OTpriLOW;LVxG2K{`0@B0Ot}^ zme{jFZAY;Yb?)!LoP$gMEpEwCQmLiBEo6&tQQQSs+x@WptBy_rCA~<_vyD zkqu5K#VdlGGa-F^pW6u$`1zit_F$;EmcJLRbEVamtnH!E8}x0LVJZ8A;k4OUo$2p+ z3ufUx)(|If45d=u2q-H_FKzYIXiDHcTiulrnht`- zuk*B>vo+4&0*w1Df{FSZYuamV)6< zF$SU$HK0;c@T$VgRJMXV*HN4AC;8_o)`o-zG|oj^TZ`~q?ODj@0t#Q7dg< zg5rVzC=c37qE$3!+o{zs3RC7PsMITZpAF@IjDt~s(7=CvgvF}F4d@3(p^f)KdXO9v zc>yoGoji0^4v>*jb}r*1=XO_fpS=gzK+;0cu^x!0SF~UHZmRAzGJYTejW9eNXY-eb zm2pjNmMQe@C{ydui&Sk4)=M;0kDOq|9|$u7D5dp*k1Mp4DP(V!; z3U|agw(A2Jx5vv+t9_uEH|nv3*YT$Bx z$ufNhJxTVwM01lV7(8?|q=Y@b2L-^`B!PE98ZHFA4!<0-n(Ob>P-V@>pOByj2jgBo zL4DR6qgDyB4^buGtzsq2Y7}LD>KBg;*Rk$F_QP-RaMbEnp*^A%*6ta*QHjD2LN)`#><(m`p7(qZ)fp#viT&>=ylHsmu z*Hfqspz;)iERIWCnL43;6*H`(t-5}+;$hFtmixDGPOB)eqldY~?m z_AV#;sV4Rpyp4cAP~d>Oja>pE`JRL5KmYg5^F0olSF3Oj=UfP>E4s{_L8AO*Y~2Q9 zO((Ui?5{?H*&=>DL?}2+!9O-#U%3$oj+Io-l;1|-k_Cr~2IB5XQ19PG{J;w-@7eZ2 zjG$Iii7e0LVTy z$`oDJ?H@EE0j`)7oTao8p<6*E(3L^AMauwJ;m%X+2DKiQiXMmYTMS5yw4^*AHF0&!~G+{ zlX|NilxvA?=ktery)r;g*Do;^E^3j_1iy4To%y>~KJjg0*fFqzuCOF&GfICz1d2zmw;xmU9Nc9t6@L+D|T_tbo8$SVOzu&kgYV z!1pY+iCc)w{CX)c{_MhtYRtb&$ga8w8M(ieIrDkjmF#Cz!2(*K^QFAc0IA%w{e7l5JU89D`t@m*}-0pNRrOFbKZQ+{0lpb8_1_-!cogDQ|iH@}ArO zc9^^;5kw64WFPY?bi3CO%~i~_ljo)p@_)$eKFM@$`?psIl#P41Jt#&~5pHd1kh5ZH zI)mc`Mo)3w;h^_`Q@LZpl+qbt3`*_LH*MEk6 z!hhh(L|no9H;w5L%#i7cgza_1;2!g5;(;J^8^4kpq8E+;!QYShTs&{|iw`nLNIFbq zx-PU81&h)sIsPsCkO5iA*R~Q7`y-@6VD7dc64>F=i-t5p;eqN`+Z^b4~3~ zOp&M}a-J_VV0t<$eq%4#yoK}A@152M3iCG*IQJKAGu71h<^2#T8Ak$NWMz}cFT!R4U;l(7S1|$koJku`-In?kSo(P za=*ocn-xAB!u7CK&gR~5ZC+HZN>Q}Ij(~#X){LF2qExi5Px-XHDOJmyJPiwPGpzq zJ~WE^f~87W2C!C59sYn%sXYF8P02Mtasb97w)b40xk2B_X`(x7cmf7yzhKnXx&z}6|cL^uLyo>4N->?Fal!o~u zlH6Q<=D&Z_<0>moicakcP{jO|k0O%>@v3r22QH_mX8Xv?okkEbD!qUc@ddY&_XxHK zTeJBfu8=v#dH7=oy5~oRb=Kq+Piie|p!8|g>(UrYFyyGwS6RedtWdS?;%DSuazj;? znVS`Fp-LtQP!b+fs*w=C5#(m#o%U>j z5r&r_FUkq_Z?D^5rp&`-)me44ETHH$j}~EdjZ-V)cT_^x$CrUj)+=`zL;vIn-ilJZ zB>4Yh%ta9&Jq+>%28i;4w%RXsSPOveP*2E2r9H5t_1Xo7=hy#bm=1*vP`5?}Q>%jK zu&~})pVx)d8l_WAoR~S3LPAa!cYHtmVk5d)_7uF+9VODCWdo?yyvas=Qi2=j#u8JNu3$0ZVFs zYI(dLPKjKRc{NV+Z?1uQz}`7uswTBXO=>s!D=;`Z9ER9;^Pm5n6g7qS@s<_%_~LiM zJZ}Z<)ja~2=Sy-#+BAWEU&3u-1ymCAxrFuo++u)oKv@p8(yi^fwOe$!2i!vDn2?HFwfF`1)6lkc`g;*O3sQ4 z_w=BM$;p;!f#FjBmeTrUKAQvz5TI7utH4Yp1X6n+W&}Q)ExnI*)w3#HP7@(GXzda_ zZv|VlwnFUBZCB9WW7I9>_Z?OsfPD;2^t)q$;!pR+?!`XYW*kOXmHXEIf*%0rxQ(-z z4CeV+xYFZ zcH?%KuoCW*KZ(e^uL}r*vsgQoT$sz#p~=w{4vP{9N=9aRX3Hji$+r{`p~?hLe;} zGF`bf|E2=+%=7)FyjJPwk*Qz_KuN?rUkmat^vNxzH#L1hmfz3Y5mFg~=}XdbLdvyD zEpRT4sdhwuk9&#jrO7?Z3i_fhe3pz0>vC&p>?i;EzZ?JxUG-LwcJ{<&L)#i`MdW<$ zsJ#IF*g~TGhL0_TOQdhx5d#9fqxdtzDl6gVFR3mP<^<eeC9YwiUsnKO z32oU@@LOTQ^b(HOgWgMse}N2R+(4KlB}ux)SDx- zvE0h@fvBPGF`u_?0pPY}w9WtgU->0y+?qvSF)&#Rm;#)WxK}isGd)NHa7eB;C8uP# zHeh8-_Sq%l#y+nsB?*7l3}pgzVJhcp3IReFxKyrWH*&r`Ehqd5gSZmbt*s@Le|~1K zYo52#dz*`ba>W=`+siGx4dQLIan-3?2B(RUf>^&Qs(?Og0SGX~$p7;X#1>MT+z0;5 z7M5jfFI9{8gV4IY*4dg!mrQA69WQcTqA*GkKNpEhRkla^MX^!d&n{`kT~paIh?Oi_ zK(=oM^>)Dqw2Z4fXcEAprd-*A1_T3G0`dL4;90kVQA|19$IP9iCzo$$v zYwPu+(Nsm>-rDNPJ9=yGuqeGK-KuZH&nR@HC2`y3S}k>p{r4ogIUJY1 zAN;a4pnq#oepVl`+X}b-1%?y;yl^$H1(@)DVOEnr)3p4AT<4ZSsjJ%uM_x~_j6_RODRZSGHp%2nqK9M(OdKJ z#rcKy{%)+w{QW%%R-z;PPBF&4eHk$;n&+jNV*j13?+4)`s`uIVsqvY{Wg5RY3A1Nw z$teby2g^D6J$Azmt^C(YAcWs9LnF+23QJKBpr^Cq4!v}xQ2haRaRik5;ey=SP1xlv zo4i{93eXjih}#u_ICO11FpyjX3jtlS6V5gx8v1JbNBIDL=a+r()}@aX-=w5kntZkX z{`7rvr-E%^JU`?#tFQz^`Rz9NM=d1FI^my$S_xV<7~r4_fDH3dkrIC07njbVrguafbcg?2wH7}0MF2Z@AiZffXjoc&2I zd+~%S>J*`-?VVr?!73Rl3KON`Ionkm@y^xhu?2J_GTGlkW$l&aXf8|O8gW-OzvKDE?pWjd~Egs)Bv$2X3^zS@bT~s|`?S)*u zCbj?he@ENd>;XYr@82u*dti13!>UQ0zn(Ma#j_^J!C65Ug5p|hRhZ)YINFZcw@@!@ zYdTxehTBdhMeRz_=GRQh3_*horYgFAv<_*DZr=~B{QwdGNzxjg= zs3bka>ZjRj&RY?I)sk33?nP~d>sp-cOLu?ITT|JGH=kEQ$F;1B#`+|L3?m_=_rR#d zp6%`Bd={Zm88-3lxB<)7@9#?ng~C$4;vx<87bF6hTeZ^CLa=4Jf{@J@ zm}j9pFNNS2M#nz;Q<~c#+BSlY!H6`CS}qyppHgroOF(^T^4z{G0C>CKrjtD(Oe{fw z+B6*3@4d>|y68Dq(eG2_G$$fQK~2H!+u{`s?K3b6@$<#G6>ZVT7P4&?7xURb6>h)c zis@NG00Z<``~5O9p@4|;o+-s5%q(gZ#S;2P!%D3Y)@slAga}d`ruOGFV)=7~v8|2I z0-YgIE^k=?-%j6(yvK}u`CIK~fsUB4_x%J8l~jm0sEQG*ZfDN4(cfbx1O&`;QIlsi z1&2tA7FTedODz#X;KmgMwL~QMjiUDv4A+vG`pM@cQUGf!E-)}i8VoeG=fBRxhMJ*< zM@KC$ucY@2)zhx_K3C?7(|~EzB)K1Lys(9bZ~@2tVS*xrx#Z}Y>;O?|Me6Jhw!~}3 zl#2&lzd@g0opqo2owCm#H+-goPWpZmNW$NHc(blwvNbJiD@vSG_#?r_9<7MdN>z&O zx_>F@e5f~c&0T(2wTk$7dxUPD>^a-)dN(7c&YgS~A|@pJu(ZbpL7lV_2A7P(i!SV| zlQKXm^p7qCY*%6q8G4g&d=!e%o>M9f-A&-1W)iRZYi|6%(BjA z0iyl_2Q~lINkjdYZ@&)q*nkrWyDEtFd z3(5PNe}Qg3zvWn0Urs*AM`C?w42WuX40a$s16jOL-K8_krK?=M*(7d&Iw;_aVzjyV zIbHV%FQc_Q#@;%$3dIejS-^j&jVaas*c1308o8(3%ia?#$X$W4Sz6S~PN6THdRQgM zC+Ye+K}guXtJaU<1oY}QhbUc-mksixBT6a?JG@+t06TySc4HX~_P?rDVma*?W9?Ky zJ$4mVcAR=G&?YN~hXD$V#$E2P&9|6fNpB9gSXm6(r+0#oxol!R#quY1w{yzM_r+Ws z_^bf#S(suv^{vQpUS?5fRpX2N?pkP{T#97JJs$NDhP`?nUPGOUah%XbAC%*<#D6aFmi`gY{G%<@C^)l~DfkidP zqu?s!7wYrWNroGhW@r4o(w`@%`f)`;?-E;l#;;ScVTWtw7tnb{;6}tH|{(&X5PetGtVtt4sCw zmT1QTmKMDL=|fVG&GXcDIWlXgx~&%fz6Y zv|N&~;p6h|;%d43ORcd9cX|(q4f-N_5#%Z4zgPe{K48?Q!CqEpNK<5_w&Fvu0WM|@ za${kM@TK>Y+H{dN5coGvr%8TG7e-iZH6oJ8%gJyZG2%W-8g}^&f?mK*v^@{unpH>c zY-$aKivq@l^uo$F2v^)B%tFroyg|ogIjlGjw?URpu=3PJx~V2?qz)KzxOrAxUZ)JfNO28+rQio@d^pJPN5v0E*WHbau+a)T%TWbPV*K6 zBy3EQ#S7fdhSmEqXP&_db~wI<-5#DSd6u8;}^{gUj{9CB#2&fMsDZ+9=c%f91*yC8`kd9W*Su>d`ln)ti z?4mJ+f1QKuyJ2-}C)W_l*@DXZ%a>l6@Uk;SF_`z^bw|zs)#QnAUUaIX?p+>=;r=Ur z#WQAUYW>nF1g)2tR<$RQa5MJ}j~`lCkOgv|;MGr#q}jEt00{^{$y%sW!!zIfuHY+E zNKspae3Up(?N140!ykbpQZ{-}0PlLbPk_WPOGBM>*S&@DUIayAQNOK121zIh>&`E#OH3TBR1XBItY{bW0)(^&x_9TKc{rQ=JcnIR)Ce*HMtCwsnK z%iE#Do(7lT)Iaks9HJd%ao&oxO_%#6ex&rCg#>R}_*R*t>Bv1#JWuZD4VlhB47JD^ zCS@V%alT))%fHe2-M;`C)t;-s03!x~D|0M_-#@@BP^(ocunVDrn$(g+rXmQj^a7Bj za6T{g?DMUdIQ7a~QEiqrJhv9@qyV??-z!G5-z(^=!mZF}ME>5H?ZGj&0CUBVx4BPl z4?x#SAOfFj3Vn(B`%?I+u!RipmnKe%KTjZ7`7CUSi-Osk8Ueoog>)`zQ7ih%1%aRz zq8WfLe$S+N*;rbUvgk_{tQgcKg2GBzpprAaFDu7Gna*Qm$rKWlNozF)K(uDjidPgE z-uvHrrgBk+=`M<~X z=li+8M>(;|Of*Hh-(uZBeM1wvM+x&oYY5VBm1CBj4pN~NM@7znMMooYC~U!XE^5vt zO-+peZEp~2YN`Gfw{ttF%~=4heXd-*g)RKR4FI1jz?#onmNGjXN^Gyv=aBsf^iiT6r{)i}C&XX@i-m~X>F7@j`28#E)0YEyC%UV!t zJ@5XFeq>RLUYdOg1mIZ!7`dX`*u&%&Ytx=)02h}gSGow#q;aKy{nwERA~XSRX`%i6 z0rF>lJxx}^fH~PH7O1RbM7W1<6D!EXGWjgx9;wFtcE7642(DUk&zMmeT(YuhhpshM z(Pf46tm_^pqzD7y)_(V;wWZr2m*G}@c_uoJ3tosLLKa6b!EJ^Z6enFTdV$gl)u3OL z`<}PZNxkezI<+~2LTUgjbT1;Vu(kW=?Aea<18t;(GksB$&(sh=y`Psq#5yAUuuoll>m)5uWTL`wSTkIjf-? z@w|ke%YgU0A-snf(PU2U8Nq@7Mp+{C+4g;wzn~72o;}L)i8rA_qc=jJM10%&#e7x> zw(ZJ3o3dPRa!ler_l&H2uyiTr$UM^yx4=0gtcY+>Kx+99v-sSKpg)Oaybs!TD(DT! zWFE*v&cAZRD66dq7T+Uf+&|X86~1VOK9_WF?Xr0xa&WvrAy(==$(D%s^wNYJ?LMQx z>3VAcFhDPsX#xrKFNl)#_N}SRabIC8E?tq+_wk-B+(zHZ3L8-y5#Ht;j?x1$X6Q=4 z+b{v3Xj*|nj{qQoLBTS9m($VteLeeFjLAK36^vD1aSAR+zzTzALP38Wnq>{ie-Cyj z^z83Gtn(GlduF8-Vd7_%r3@y$(g=jf)%!#-3fo&FCmpToKBFDBftp%HD`Lge1Z9gH zes*0^jj-tHVGZx2_vr7*lq~|NB$d^zuxL&3ii^kp`QM>pnLeWt03p8)x&872O_KQTnSt!xKB7~2Vd z`Ho)&1aC2dx1IglWiJu@yNOB(qEz@B2*2AM1HIc0BY=jmR5&=(BL*Yn+~DUzI@Int zSGgtJ&k$tvN~=A{{#pTSudq0=`>is6E%AbjV>&)tm(+@Z$q-l}6>+t0S1f32QQ18{ z1DAV}V`vEiHY4}eEkT@mi`!Xg&fDhCx=(s4PC8rkGge*cLP1^PgfGq|dLWKKZmnu- zHODDnwV)!K|C8df9BSmVsH<%P+xMCKahaMuzvo=SRJiD^+q|U#SCzAzfZCtj`g>oT z%ukGJe(iYQaqHNM+^6z+YfM;R7MYnLv#b1SC2uqvP{B$$L z?~};lZNxbX`CQG}eP9K+XGs$P3HWzU9xsj~EbB^3DBPNI|NU)WR919Hi=Pn--3X2{~qpn zrZ5G)pWI@=pWCpNnvv4A*c$OdzsjrCzq<(NGID%>!R-((gQ-QzvRq5oB0c5LuFQn_ zb%%c&WCbw+xt1)1gZ`YtRJlepk~7*O5CbHMi9&$5)%V{!)1&R}Y#<`W&#~`MA1CB0;je9P?L2`)~jOxTjuM1mt`b?(;soA72)@ zBn1qll>>$V$xXzX^i|vGi3~!_sO;l;CJZ137HS}ZBWopScp?1*PYZ}4aZ5>G_ne7& zV%P5zBiu%c25ClD2O~|JJ~kFaZ`il}D=Wb^y$9ajTKzVR6z!b10oPj~YP2VrV?3{D z(yreU7A`8NYqpb$R{13#WQ6Q*#Zd7&pM`rhstXmz`sJ17Oc8CLKbD!#ryOxb!Y=Y|`Pkl?+4nlekSMG~m>~pCpq*J#CjnVIG z`u$9$m+o%^{%zM1A>ZhWM&nZ+0;!Re1lVlz*aY;%vM?u>o$_b9JtZ2LiBDWz^3@& z0!lh&2D|Rfsfe-;$R1~Z!I;$hsvEFq))EHYL~p^N0P3hy_pME_42|F(!;!hxPd^Z# z<){3#t7~?&Yq%CGP8>qnZ2ZGOOgVal=GIL=)S?f7mrQ=(cxNLHwn9+&g8f;;N+X3) z@+`xrVk;x=CIm+~yaSx%*z*GT#yUOaEtcc0+B!I{&;r@6P5n(bUvz#xh8$CFik3FY zLB@5PcfsvW2>=QrbDgc0s*?CrP2q;Cfsc*X{B(!%4=xtQ@IFX!z$&wjOg8CsovJSB zqzMwyJ+_6_i#M~IqIGe&eZdSIqVTe@I%Xb)J7}1v@R2Ee@&}`Hl067lgxBj7g(NlW zu}Bn71e7um@#P|ZvG_chO({Yglm<<^Dt&*uWY8Gg+b<0cze*kQ5I`^?9AcM^*t?9H z*xyHu1yb6oG|=7|=K2PWSYl_%r%9~Mw&=CxshlclYWg^Hl1aL*9^KEViQA~0FVJFE3{V!5h7P*Yfw<{r(Nnhpj z#?O*lITQ4&`-fmr#6t6#-+AtRFz(1c32&MG9!KThp8*a<7o2lLQxUYW0O+c5Oo!WKwl$K$mAax3uojGmzcJHjFfa*nCVReqIqWe(6yD z+Or>_vTCQ(bb^3T0AfFz#Jxnt+}2NhVe2Q2ri>&F1klQJXi(N775y68`0?iO2A%x6 zfpsU&b1SulEaT^Wgqec4G;6i8!kSR-u^7#>Gz;F+07-G73Ny_sd8r^ZAeHk6)LGNQ zG%UNeE#5LiG&Yuy^lDJt{~7XV16&;N@4$0b+%p32;p-ESD`>-2DNEG)d4-%ehtt`g zVJhr#LZkE%t=Z=>u5U&@Svs-&{y^Z66Z=ue?z6IqtXl`W^&3ZG`pIER%S z26$I-ZVGnUGPzms7n#T1bzdNAFYF1oi&eson<95&aiD?DeaW_^J>;lpv6j)ztk92e z=6+MyEW#&2J5A0uV5=Q;4MPr=8#t%5qP>%o?I5UXHYCOP9)a%wvOW`xDwTseiRc|W zLd<#}urWCj8jXC*^}$tV%8Xz8GW`6an$;p20|q#5j`P;sF&d zeubC1qJrL7O(h8q)p@f8+w?=y5H{OWHT zJp}S2g`tbvuDvWG=)4u0n374p#c3|gBDV0of1UUw|E|gxmh1;6fnyycNOBvca)(-XcIVFgVXbE+VTmkX@EZ12xZ7+=CwWy07*b zq#QrX-bBMP7*|=Q;XUa2t^ZtcWQ`*AQ++?LNdK_7H1PS7B7d`XHA?-xck34kT>Q#m zgr7||#cdND)%Vmumi3D7pZM)P3r4yrSV3jI-Tt)(6#x||`Po?Rtu1(cqG)Aq{pLlN zxsaV=?ItwB+U8B>#UKt9&_ zc~F3O%3EB?d_@Ua%La^jeAhK{3iBh`mlp#I1PtW!RTL;_i%E*UQ~QIv z{wr2mp?hT17h!Pk{rL+5*HSs35V<@?0m8RjEP`7`uw2yT=N7rk>5RDwUld@NMfN~grvvj zVBx=P#gs62ji?ZJW!c+URGSTGRfVMDBnxFWU_Lo8QSt9|PXDO`L$mT%aplA)-RQ%s zwbz0_UJH9=_B{S>IZH+})|w07>H6T>3sGGMB(>Nz%kI<`taCui*r4%x+%ECBHo6_o z?>=w%wsjX=N&V86yF$^MW%y2H{}=E1;rZyxLXgQxX`z0bH-Kq5j!V0QVb3L`$t)Mu z>)Oo}5qhh!M7nEFS22X&{ro&~=40LYIT0`%Eo!V_#p{iaO(R=DidB0m`&LSyo;6bj z|DP3HM<@P#*wT1^fr;#U?9qxueB!}1a|;0FvARU^AupOLp&+3mIUkVWKiSSr7(VqNK>38ML-8w6bz z0(s@G9HJ7~g8SI2?p4)p)JtAwkX%Q_a!aN4LoPu|5CBW7S$^}|3S{{n44+)M1lf3myU4z<jIeO&I9wR6$zzt_+lbt3`f?1USuk&K?;%ZUjOLG^hr0Lmm(zYJQ6kj7uV1tW|g9TpBRgOCCiokvai&+>|~^VUGT48)3iR zfE(gKw+~LXQ6;VX-<*g{JAgN=QY~t0lBZD%0s>upF5^P8H6oc-F}lBz1A=zsenfxl z)u4xKm;4*~HlYwvxMGYz?rE+nlzh0F;ORmkQDm7amVSEfO0pT(|A#=Il09bt2Emln zT4>J}EeoXb#Z;FnHO3Z^cfr*)id`fX4#O&kE?s;JW@%qw*6$Y!Jtn|-_48}+p#-0c zIyti8F2vfmtX`if>f;Q&IX^{)YZwp~DuPSBagQvbu$5`2`_JkDn1-_El7+JjY%WrG zfqv-F_ke#zl#Ww_kW3|PR2zcujm+Yys>p0nDCpE)!e;W^8T5MocNeNMu#mwq&H0og{w^RR> zG*)hi2e9(42YeVs@4+sfc%$6uv@w>U9X7;#RgnO%O-1E4- z`xTj}+!jyap%cbYY*-L0)YvkN1Lnca5_k#<_tkRR-X7O44rfM%xz|lBt;FGQyxTBc zNWCQA6qy5SnEM^Fda;!5pE$pJ$|^2(uF~s!{!rp!A=y1vMej19IVz!VO?3fFn7+zeQ(G0Q0^4n%R=HkqSxbq07-P@y zUc-T{>zXCtG;-URpDSzO1XS2?@oNA9Ir7fZdgB$&c$oX?9))PnFFuvw$G@?-i9&5D zg>3lbi{c)kY|)T~0ZZC0u$v1TKny z6K>I$E~4o^N6dh?eW&JClaG-lh3o1lX;#X^CzF5(L$O)lr!KRNS7;LdCBu-XU3M#y zT#$B_*Rm4zAn8D^J+q5%7Ue=xJII1q$d9!4mA3@Ubu%M;tpWfljeXWG>E23brpePo z))NfG`JFBeK(JTeUA)1VJ3I7oj9k4Q#bdU z+Ggbn(Jvnhr3NZ*nI5*^A0_17ie1kk^^zLalu)piDvLivzmk5jX*UFb%qWNU*^BCl z_O98AHndXsd!iFGAX0Hbws_8d%B#Bt7G%62t5-%wFGKOEL%g^=!2*d0A{s{x;f@au zL@!90r)ZEgrrKL|0|H4jt0ixu83Ma$3XA;b|Dd++v-)|kR$%PIyS-Jz&Dfmf~~U8u@(BsK;Hm{kD}dYw3DLLN$o$61Q|)n-s4g zpQ*&8hR|{fV&z*}6y_{?kixAk0OTrZiF#VL{R$$L^GX}3oKQA4;E zM8G*)$rd?{$aAJZh{K%Wu$FR|ns!pO+1Ce1YCA){I&ZW0^S4SWF76r7yoU_nq~VJ2 zwgd3IMVpZULar%&TV~+j?`-_?+6ff=!txT0^e}M#Zkf@bIY2=eyA;~RdE)d1%8bk6 z)@=|jn;aJB%rne9CT+r>-I~KNmiD*ft-Y&sXwGB8>8ZD&Nlsk=ZO)TrxCdKKwo;}O zrP_>Rf^%4(1BM99wvp%WEC7P;wWL)z1^C=3x^-U7C-)iij6cg^U6oNoYdgQ+Wr37~ z;_^~}54K#Xfi@)VZH9{~J8FA*-hwod)3;Iqmk534N81CZZbfy40KxNiQm+U^8;bkc zw@fkjSw);m&IM2|e4g(op$l6Ki1Zb0!B6(z4-H-tHT|PkqkMJ+z@(y)2`y`A^ZeC$ zb$0)p=n}+AT9e$yEpd8;`!`_Hm*{Oz{nzaxEruC=5P&?mb*coN!Kw;suJ3F)EtX)R zxp<+#V&`yCe?PmX$l{9{T=VepamUnJvxn!wLWz4mq=Uaf@Un9%?UlOoX%*fA6t39? z=%`??$jV%${0-z#swMDq_W>IU)|e$#%{7mBM2x!PpqZp`@=2 zSn_ex6*F!#x!p_ZI-ElrG)-yAA%P3LD2i=`z|>D}@15&Y0)t0j2Khcv$s(0Vy^d+t zqz+Qcw+yZ9RO@*`(An4e3!EzD-6}JjA2)UL1}dhi_9CTV`hDr0&dx-v>fH!}ba$~T za+`L^l(Q%za4jOKiu%OIG)*?_3I!M$@mweD3bnL4mTBBXyj6Yf;!7B7i~G z2zC@7yxtQ`A45zy)bL!?U6i&!(-1u&rR1pf4XuSidr66s=YuZ!d)}_$b2k}5$>HoT zrsz4l>n-HDBt0Vta}EFZxa`6%(q~(OVvS(r{pg1&(fdM&I<{s)ht4c-$xUExE|@H+ zRPtiaLkjBe)ywd93{Gt5?m>CtzdxV|)MpiFu#67P{o1AhGnc5+z9APg9dve8O`;`6 zYH*k=o>Wk@xCwf@U)H*a*Ndf$hF{k5gbuDX8iU zjGQI65bV^!bR246Pf;YETu;R|muR^L!5JkclNdNTk)osVbdwu!=?`Fz@0NkGL`8tn z-!&e2i%!nz_hVs%3?v2|Hp~a&8NB+9+2{op7AU+)tc4TjM;$1#yK>CNs1wWG!_>sPFH?GD~R)0 zu`+a!SYH-r(+U4u0=(ck^ma!Ym01z+gx_P##XZLe%7dsvMQ~yD^S`9Zg^Jd522@dN>biq^SRpR~EEW}zuYe6tpIVg83>y_Y1?<67uQ{bcK=7-o!oq5) zKZ{-@SBZJxE^8<(Q9MF-)_g1cH4@`(@%&+tABvU~bEk08IDiT*S6Zk`Jz8%Cm0?5X z100A-5K5vaQRY@%9NdM&+{nAHhT<#1`@Wk*Mfc*U`nWnvgp<(~mZ2%K+Uk9QpL9Zy zE0!e__aqn$tX0;}7#YU=i?I1nduz*qf2O|}C?9{UB4b(p%K$J|Jb*$SaLM9&8X#w{ zlp1RzaafmE_Qv{>w?9aOa(Ss5xCV4Qe3OA3e0Y{z^9~^cn^hSOA{t74F17**Uj;)h zZ}!g>E@7q%WDNlo&;i6*8R|)gn-x0?;=&^1t97GRW;vMm@vW? z^S>_&^iuA(Kw7>n)}mfkG>ByQI6wpdT;aBsnu1nCfIkJ8KBHB1Sq$IDN+P7PAuQq^ z08c=$zplde)O1oS!~j(pCB!X4L)5e@fKdoz!Idnd-<@<(m~9h@5CPNJvy`X#t@D1a z<*MO6y6+d4qJ|?-c16}KQ8cPvBIUa5pLFejDOi(IRsgrL^yg1)qrK?>Lb%Tk1mp8p zrf5X(6D>3auAN^80&lMy0t@m*R5C)Zn zn0o#G9fK48ebeL5)N&q!=bLj=X4yHoXKTt}6bsu(WcT*gk~upmjZ&8aMs499=iGk# z_r|y)e!_B@ku?Mj6oT!09Dt0#X};%5T!6J#4MV9R{JcWl=RL+q#@TFZ{EQ*V%GR*_ z`)IdZS43#268~1qalR5zgSv&Fch2~1!KgqO{8EN}u;fB60*G@Q?DKgifsbyPkUj$;aBHB|L?8a zo0C;QU|BSjtm4mKpM;BfYz2y+wQl2op7=e}n@X%b;fSl>p{N1Pz9IJ+eCDTrJ8f(C z;hrhnXPCNG6Fe_|WEo)<-%bF?0G>C0H;j)5pZpKU2S;5m0oc-@aGL~!BxDUVYQn&P z1b#|?zJCBQ!W8JvhLyy<0x`YiR0!EU1VLs~Sc*%yLL-b=x6HnYAUJ8>JpS?BXio*H z+?>i3dJxg!MzuLtwCX};aG$bvQQQjQGg#C<%OPscLN3IS5SVafS#y>J-XLY9IW^2N z>|a`upYOrc6o5!;qHYI5z(r+CBXe5wyvNa;X&Wat`vwS4E?Iw#x_|g9FkmMpUD)mE zY^t)#ui`)Nv75*~J3>ND+a>$&X;=ZnEdoROEJPSE{kaPFc~MxV5$!GEe3_!b(5lUR zj7$l)l=-{m%Kzv87EZigjJglVJ53Uqx~@vLe5%2ojI0cLKJg!UNYQe}@mcwXs^nUre*=={ks{_;0s zs{F%Q-PDqrZoEx~)Ud%fy-tj`DtB|nx`1;$-`#zq(zuG9h>k)wcbE#ic(+U&BS6NJ z%=5LE|Nx<97h1bwsF>d{YZZtcb8P7E;j8__qj<6}@ZXG0uKAL6w zUdtOmQo6U|^2?AAn~V7wv_g@?I*(+9q41)&lNHDtswl7-ZZzXAAQF=wt&3Dss1Wy1 zP%j2@^qftBUW8yR{A1o~r>_T;)=q6kh}Bhvz2(Wi=XIU91h$0^~>!ZaIr&4j65d+p_WQ`KO|wjEK3Zs5S=O=Mi!d?%$^w3QQ>pubv2x zSBT0~PR^E4UI!}ERelgHQb+U)QhC@>#ST6!>bn-ut6rC~&$3Gl_cWfpY&#Ya(~hI2 z{;-H*aY5kuNw!ag`{iY3stOKz)E5-F1N@2DQUF&O$L4=y=IjXaO)sTg)R!mcImfJR zzFMcIjB4dg*ZI_E23>NBpSOxZHy6OmZK=fZNS+(&l8BPknLDYpg=#Trxp5aJ!-X*tr zicD3}58;_ag%fHk_2|ne0g!rJ$t4EFO*ne-$L^(K4H_Y+v8EE+^o9(74!8x#n}G%r zU&ozItO|7FsW?-RlEs@bAGEgY!+Wg> z@zA!&3D!1Xq<-9eRP=NbrC!|9-sD{d`FynoOR~`J4vd*1oY$EJhV@u;X)MTsm91yo z&Wax}WrL}ULFs)-1)`=T_Ez27klsg=XRi9z>Doj4!2^cwCn4PfRLP!Tjb(M;6xlmJ zqGKz#rC)VXie#<*ajnvCK>kDsWsBDP*kD$0%Bq^Z={hT6uJaq%_Wn`5b1~R9;z<$x z;&wUGfs=uN5s?w$cbLY&nRd+uRoTo!0ItmUuS^$KcpI*Pv0t1l;-Y+LgjcJYP1WAf zkhO!$S%qc-Ccy{(nH5J~3NEOYJvw*^20M>G@#jDPJ6yPIV~SvCz`c#QFmzQA$fm{Z z00tnef3~eNd${JgW!>Y>)S`nSI0u^Jd$txNrU;W+_ST?cCMVejGrjGR$a`*O9xib3 zSNzQZN(u`|Lm;U^XY0qwA&`L-vmj_h>E$mWP^ew$)xl+0^Z611Oa6IgM!M&HlI+p= zpWeG-6aeIJ9D93}lEZ|F5Q}sn5f;MMoB$Xi4Q3nib~yD)(_!Hj?}=+_*+%GwH;q9@ zD=S7pLqqa&gMZK4j_NIwpwn53{)LupnesdXDgXfRR*00YhP|=^TId(QEZ)n?&xR1p z!>cVvy#{zDsu!9H(y+@pHkAODSwC^71Fd&t6G12&UjA5Z( z2^Mv=(g;)M3JBm|N*T=ZJWIRygQ78G5&+Of^mqF2Gm3i}yiEk#SRv#{(*xFp02j5E zu!IY)D@`x7XG!k-eV=TqjHGNkHj`0Y2tGJ;Y16>Y3lNuTbxbdGt z1F_yO8PHHTjPQzlD(f5GvdQ4L3-Gjw!gAYqSt7D7o8T-GG?zRoD6SCW^`I;xz>VDP zpfhR-#VvAyJ=lUOpICS;kh{E#a0&;-W_P&D#v0+t?)~X#R>h%T#5JjwE-7b)enKMu z@(N20SFqSs^g&djb`$@>pKb3^+x&O~)>@=I0&AO?766SmP5Q?{*wPYRuC)E?Sj&70 zSjE5Wlxi9|4+J_0YKw%DF}ZCceNVvgRqd%q>g1~}?*mjpftRo9u>STM{FL~jF%}jb z2;YEC!PVt_u2(503jlfskcf{9q$XlPiV{PNQABlZague)O-@x>dn-y}axgwQ4!v69 zzr#oYssToTyEN^X)IJ-MKEw%~sI|x-ZpnkXc2-k${E(dD?ew3a$(=WAW+D2f%|Udp zF3kDa^7N65C{}L5DjE}#?%w)3XN=X^psM~+GxSHBpm#JISqcT`QUI|i{G{BfzFyp# zb#CDp1vx5-d9?yUQ|W6R3DC4r5ogGu|1`_0a7vg6HVJI2T^zKT@i)s{-B-Wfe(v&j#^kAe_UQggQW z7rB!LTstufnZ94exHp70KAoJ2sy1+($`mG2&UCFM10$YoyZ|#>=GR#=2^;`+UW%Yv z!pSPHR_!tM4l;LcvRSE$!;J8xVxDSZ{k3fReyRc%KyiSogypkK_w>+Hu#KX8WunN{ zaB)|z?&YQrv(G+CF}7k~8iwor^}=Bc5w*TWxt`6ts`581;gJ8&#I=QnsKrVq%Ou3g z`bDIRWnBt4E+j09C>;v>0fE3ZtG1vQ2BnKHf3$^`db?UgZr6&`{^x&fV|m6!pO+Ei z)rqdQAbyoIh#=!9#}R>*8bo><2n|i0>%LEtnzT$hm2B8BInTHaqzurYdkMmH**Sy0 z2YZm|MFn6sy#z!@%M^m}?d9LCZC}+bx>i`9_}OR)O|f6}l3x{Q1*|z=xfpK;K2zk~ zhjXrE3YUr!)b|8w{}hMFC^I--QeK6q1;DVf^7*PPWCbP)?FxS>O&~&OYATwXw-FMY z-jpp+F_9jm{gWlcrO&$b0>^-xHU#8~<-V+uR{L`l?BB>yR<7--d&o^C(EBlfdOI}9 z@5et4ZVF{?NN%b#B)7jngYuB>eQut=$7e4=Fi?~M<@$ciHUc5r{5u9v2J&Ex_I9RM zz&;x}6|DrzF7T&Egn`BIhli_Z2nMEs8Ub?*{9Lh8u&C|bqBb&I__@_O(_7f$h}^AI zDwnnL@uCq93)yF2t)O))@1Nd0mrS6y%Ygzy>$5}|i(hxB?Imvobvr4zY~cr#A;;`2 z+MN3LM@+>?4lFQ%#VdD!7{@F8dj=P*b!QPpHBt5$ZDuJG3?rFW(Avcs79H-{guS{F zVxiX+1OeSEO)7NK@aNmN9dOwzgut+q)PJSQp((?!bbw*exMh;OT?ehMPQwe|yt`iH zLJ!A<`u#+0l4r~f+oVJOF%R|KtmzfXapXxKaU{zYnN#}+U##r6Ql|3~A^9UU5COlI zbMW-!<+qOV5iu?Gu^5)CQn9|2nf-LYO3ADHP`jxPs;t+8nqUi5_Oc0K*Yw)S>@0L} z=xXAF^Lvp86@g-Qp=5)}S{2*`0#CrDe?Qxsy8!I-4R2kR70YKX?rqvZz^Z`Xxf&6jJnAK00Z#ocL6hFGW3&xc%%OEmSDnN_)uEk@Ff6 zMhBl&`FwD|0RYw@Aee$w@MSBd2<_aQTL`%Vp*-E?`Lyk~_&BsLI@;i-YGZGL--XO| zfb-!tsio#-IB>#CxKx5mBSGCz&&Z-D-1rnYRnSLP@)_c@>$cm7xGpVjA?Fw|UiWOp|NI`tod#XW?j718QC=|a%*byt#BlA zbX}7C_K-wz%OmN;fEkPdW#f%eo-&K$noE1(M02R10@`&7G^sc5nJ^Dm;DQC0qXaRD z)E{w^^vi6Ih+fpXV*vflUsd^bo4Kzd3x4UIXf=$;?3)hJ8ocp7Z`OxN@d>c2)hzey zV}!RoC2KrIGT+q#C{Fnc)?Xz$a-vhcC-1QH%VE=e*IdXEt@+MVWOG{D!<(-D>*)mGVs?W87p&)F9Ac7l>gz6!_3E1vK0~2!??~ z{=JKyA7Z@B2azvsWCAPJa=|E414=+NNok<-z@%E7y?5h;Y1R}Og^#c5+5f8C-so+7 zl*+iP0;$90YV9X3LwkD6O;n5USp|Q%pm!ExQQSOB$k08}Xt+9mpH5@TpK@*5$TaY5 zej{1HL!R0U3nL%&7PL**BFAhcpqHB=QG2ImaG$=v65%9azn9Apfv5DxCQ%B^#He1S-ApKtkoe1 z)kjUUv)VY>4*b0k_%nC~tq%eluXOZGFW=wAG;vfvr(Pika*eR5^Www7+&JD-3|NGcW5*5@;U~{)DqSJt4ui;WvhTuuT|Z-bRo-Mul=q6Et_?zc2Kl9ze_ek zIxGBLO%j&&yke4*OU)m*Ah-||VFTP$q)>a=&cWmsVg%Wc41ttO#Y-UHluhxAkWImb z(fi8kI@1}b3U`rg4Wt67I+cDfH8k};K>`#bk(da$^hp%$_z`aGVp*n3%2(Bf`6I8x z<0MC2ZP6-{p}dIy!G&Q3Ic=VqAc;N?5g67Rtw_MGU>{P{`F0&qgdgM7st8QyG8*-|jqY?6)=ukV40B-Pu2EEVefS-NM)>0$>)>f zd8WPvYRIZ2D#$PA9V&&no4(J6ogCLvrz%a-uYbN#{7jPDZubHa;Vme}dID?_sJ-ZA zcX53pP-Ou%6r^GpT(}Ps!Ka_&*gWC<7`u>W25%N%AE;PXEQyQj7dN3qBdf|tD)MJ6 zLLqhuA>6krPJM0ggepAL)HXM?B~gn z?dASbcaf@}_b>e;?-JW3AFR)o(*>OQXTuJt{rAfS8DDL%G9=yAh6#KMN-$h$vl|F) z0sMo)@V@9Vg+8+T8_95=WMb#2@J;Yiq?b7`OhWgf!{0caTdTtUMhA0#eDLG_i-?R)IbGJxL0t<-IA)fVR1p$mBb zthee`ZHNWQh*P*ZwC=OZ&)eab&-U6ytFC)fZqqekZR+Q=Ny+a0t$EPZ@x`;~72ncQ z3DZ+vxeT6{Qr5Can0gybl4t(;ZC6|27MQK8zRduO;sT6d&I4f0PV!vpZ8IY8^LD1K z+}bDU$xoB5`CQOirnHj%e4|?F0%b^X56%9j{W~~(Ti`NL=9n_?ldV;MMbQZqU1=J+ z1q7C6uUNmG1#J+TZ3wv{f6yZ7ny1>MBS(Bo=gGWwwE!UXi1Y&#>1KTmj90Uo=qB z7wCTwfEnQTQEoYvSKR}I(;86U-qsej5$tc;p0h|rdzg3|&QB8HWrLnB!(|w5iw?K> zeUiRKL(9656+gGOmc=CknG^A!rgH#H$4cM6K;Iq#`6sXKvFnO-fB#;br3(&fK?s*% zB_?xL69&aa6i%0ijV-(6|Mmdx((B7dZ8I0>?+Z zQlG|PFJ2%Oh+K4?ens&pkN}ufsM0vkTh9AaCYsnrM7u{v6ZqLgP?BTpfq7?Bzlq!P z@@}0CZ}w@9-rcT^JvNK*l+!BA#T7 zB2fZUUW}a=C%aoS_GSZY0cD<}2dYf=SG>~g(3U2dBt>Z7w%PmxdT}VHkgMe_-r~hF zX;p8~Y^>vDvS7-k2q^7<+=7aGP_?~j>?yb_^en!vTGo0OAV$7tcr&VA?3Us;-OBDN z7;~$L>!{Wk*1FA#7FZbkoCMpfQ}mlVJ6G9|K@((sZ_=Jc{s7=vZWxN&Rb^mIg9X8^ zBM|Kh6thgrPOz0fWPNdg;9?3%|A^A{mfu&#{`w+dN8y+Uy;uqXcMX zD_{ZZf%vWe{J(WcYzGNi@0mSxh*V5NnC8OG%lOImmLVF`qq8AU8x6sfwkK``DT6L< zQrO-CQWH|fhynZdr4m%?_U+}?55Qj5mJ!~^{T`==ZHowK=VOM$vOAa$UO~i0Q8E$} zrmu^eCLmr@+>n(yi7*g?zFNuj|!|P1Xy>P#8r)Q7UYYPfy#Z}0^FJuyfsu(6*)As&$!?`x6Bjl z63`D9x(V3a>dP`12tg1AirD*dYW4D)_|;sD)8H#T$w{sSdwOk?3EA+PwRIM%kkg55f z|Bc%V$`Lcg2o5)m0J>d4Hq8xKZ!tNSlqn4PmiC#N;?@#DxNX!g7K9OoWo`9K!KO6m ztwjy|`Fk!qx5y38THCAWclowcxy>K0!Lxyu_b*FROudEFoN;dr>3rVk{FNL5M1WW% zw0o`~Ix@Di#}%aVGlU|7r_I#jd_OmpUolygULc*op9us+gT8IJDpxM8DzADGaSKcI z0ss(%LcezB;95z)C)j5pta#O?!H`R>az3ZLJih}EaDlTNv8*4?L*|eexCE0)dr`uC zUUFRsogPt}CpMFm%sF#{VQ(S}TmN$!lyFN!2Kjxi2kw=N88f|_zQ1*5MoeJ?rnfyv zGiEKbQbMHRf871%3F@BrBztfVjm){nQF^hDQ(hpXf_ogDk+-y@g*h>kc1^TGU1@cn zxUY}G%^AW1*5AdDVvu_I_cAG5b;#mtAYf`c2OI5ct^Lerufdqr_q#_n9{ z&-iu1-*mOvb>}iL7K;al-8^s6JT|8A} z94E8D&)UOQnQV>87!3pXB0`bKiT66||dvUv8JSY#b20iE4SFH#cfw zS?LKll8gsx;u~2e(h&|#a#Lt+KFcHg^#)3Q`Vm;M*API8%_}il5O6h|xrZQ9+o;vp z(%#!_ei-zRwX<^$9d)xdHimFWqbM_k>zoK%!4^G;+Dovqm|q8yDZ(lAkY0*ScL>Vok0;&`_ zgJs3OmRKUFJ)%5%59X-!CTIST1C$wM%)>sq=7E+pcL>Vj>c@XtRXV+7he!+{?C~oO zQU6nTu!vln5^jb)R-uyh!cm=4Az-z`kr5XrY$-wAyC9;6wAyn|b<_lb{r}oPSU|0G z;Li*JsEBx}N-iq=&;PccH4)2xg7Yeg0$0Q&H63_Mdva^(g1FR15|%3b~I3j}Ya#A}aKb_KoW>cP#gMOl#3(Xb7e64Jv_8w&ReNv z`AU22_D(|-rfRlPZ>MgVW`!HT=bp?z$s(N|5pS4I>?OCh)>XJHm$#j^sjUSA^ws-lK?Uip;r{-oEpWRp9K|Tby+rm*xvqo+ zAY{1LfYJe6YvKMS&{hAV73f-@x0-LkHC1Z@Mh7t~`fIIB=zebVK3@_Atp=SeI`Zb+ zqCsNohnlyr2b>g_sbK5>_kYltKUfK37KjhJG9z-5Gw zZ*%ZpGpnGgPUGH}K)?przX|hP1fLbxC+UZwtCzH`vBj(_Kz69wJ#b?q55RZhR-LL> z7ChdF_lN%$mwuPYzaSm>7P!-cbT+k!^v!gHN$u9SgvTsav3HseuvX%nKt4-@bR)}~ z@5W4rA25EKCGykR3!=O$9IRs_R~Ln?U{E|uNp-R7+x{|S93i8%Mi;vaCd}4+vmbum}s#Mmj4Pif;#e6*Fz^ZD?N#%8eZ>5sFc{s&FTA%eH*(Rq*MAlpmTPr+6p5_#1$U>fDo^u5#52VYFl^Wg{P!o>WD309~Il~0P)GPE$Q z(_9GIo!$ZzUt^f6IA1)phGbQ3@l&sWYs0xtA5mse;Vwg1t*mL#KQql#-vA<%RvF#Y z$sW{Ht0wh(t8hQ28g{axlliFLVn*D&Z-f4=J$Ny6k%E56(SqjIMHoSI@KLvtWqB_iC_5WO-eC<}(En z&iIrBjFiH2t5uM>AjH9ke-_lgm)Bayvo6iYYCX`E?x5=rz>iSv+&vvL*eSx<4-u6B zY@2ETwBM62J3?hn81Gj^^ekQ9JG)vz}3md}6w7YVo) zh2egDpIc-yhl$p0&7p}nd*t+PgIIzbGPaI2ahLbAMu7ai0^e4RBk+E|3L^y6fUfiq zCVv|Uj>YKJliQdSA`N{eNOOCgiZI9dyu?tfzYYX0oM);>^MC%o4p>t?#Ij^)EprM2 zG%2o(JhP(w!{v{$A&}=S$}6r5h!Xxb)U_yw=LR(u6Dugs%ka1L6wups1rZK(N%2yI z>na36y~XK;bsUpJu{LES1p{?SY0fLqvQ{o_C~a+h&m8MzW6x`}&sB{-ZyUY$8OVJb z>u2b6ruxckU?HcRvMho-UGFgvhLtR>oL4!wVA9H>hn-F79;60xinu+jzwa$%o)r?g z?B_B)CmRxEkUcSh7=52x`?#iVF9U9y=dWC*qq=1Qx*=u3=MpJw$`K+?+qx|dE8(Ey zzI%i=HgmA4~o?Q=WB-~7-ch8$|si}ti3Y~yW{ zmE038R|>#z8F4xcESCX=TX0PXHvc}!sP-9LVY;(DHD^jo|Mm#6EX1^2`+4h98fi#{ z#o^ny~c5Jt1H` zSnN0L^5aAy{cwT1k}|3tE86E%j@gIUXEr+MddtS+{Vv3aUy%U-Ys+NxD-ZnC6~qrOtNXHpE9OEP1gk6xbOV{-ifjzv*0eFBXo7>9`_Xf z&SVIjE(Z9bDp@-|LX3eq9h5_O;qTg+f?lT-YwA;UT47xR7(8iFDI4uN{7b3OZoF9p z6vN8ZVt_MS13F-4H*B;k2VKYwwJ5d>{^8chf55CbuRzS_1?-Q-31vVRJewy&8NVz2 z?p23i4M)@$3t;AY@c=VTzDgBcS1!9AF3Ni@1g%2|B;#Dj8l-+MfVL{a?dSYT8EPj7`UyJyX!dWB&g>#b^@xCgBN~aMMdYqs6cK`d+3MUnmjvvQPz21QYs$AjbQO1PYhjA1`31v4bwrH#1O^y^-NzVh+gvg1_t~a>UCrjyH$q;Sm)4Ou8BM znhR0;f?9rP%qLobEosYz;+6prBSOs_?H_j%hSd8*hL^_HpnP!^b+HmdV&5J88o0RO1?tX8-A@$SK}a^A~2z+aw8u3SlCNbGAM&s<^R% z^M&2tI!KxDg9Z{+ndVx68P*PI_W>V#EpV|#YK3E-WHz;HmFI6s*IlqgQN$J=Mx2Lg z;Q~3%v6j526luCi0U3)ElX3R2-y}?(A*k7lUjvb_;C9sThkckrFV=mlR&%^QiJ0^T zP^nbQ{EFT4^g>S|KZdiwux`}cQ!%^i2)}QfIc`@F?Ury5$;A%A*#y|*?o>JXgBquH z09+$bKx>}plDaNfG`zCFBrK*_F!A|pP=$iC+5iCn07*naR9E%1H726-(#*Y_-O+O^HrU)V zVS}kUl@xk#Xl>QeawnNPQ__dGe=0x+XkJn~ud}ycMEB*B|rNvMFJ(WOh+Y`d$>tG5*Lk zYebLbRW=XW`yE&Zs0Nn=*TtbHe2ZDbzjyqM7wz(8t)CBVVZEnYh7VU!);#+~eH8~o z@19$Hli4@lHoQ9=EX`$v&WQ@&kNraCe+gtW(J#s=NWLnnk&@ zYlkOrsV)>9QFWs{@i21n=vKsoWi}#{VSxU|5v&4N`K#Qy4nch!5?inT@e*kW#k9-j?0uc|ZY7U=bC_kKd=X+rcR4J*yDKg8fPpZ=@&?^y>O^k<6uSTL zl=2?IQ~)hG+2>`IaCvz2QMh3TmFG~K#~PHrTX;!=%6g@O5`f7)hA2+hx&hk}=|^SK zY{6Ggl^(wg7GEUUdiN9@Kf{g%f9lJ z{pEvmmvkR#QiKZ^g}o{4fRM4f7E2DIx`RA_lX>d2ob6P2IM(&~T+8x)*NB=SfF(v> z9c>dsa$N3Gtr9?amRK#9QJ?hXf!BfB@{`;Th}8MoJdEH)u8T6feL`TwhJVNv*yV!_ zN=2khS6*+qr1T6fg5|ZzN6Uz$D*M;Rrl<40VCQ$yhf^{ZiIUnZFJx%*y7!{dc!dJi zP6?1`+D-N!(#ngeaLRp;H2M`1)LR9a@`T+)Cgc=CFziBb3(AyO)!$p5f7Pqfz&I|RowB=g3zJJTC zXiAxHDp~s;4PY3a4QCxoZZI^4D@aM#6G9MVXuM|FW`zy zB;p=^^)Flx007e3Jzg>B?S;{oh=}b`&S1|}Xh(vazLm^l{J;M9i({lXs0GV1SD1q! zM$_zd8#_E)-Dk=BtUJ~yOlc$oG{GbVheFR(T_~u?E|+=pygK_6#KCWyvTg@1kfV8U zrD!yIUYslm+$Y{+_jFRbhxZr^TnJZ*-~zYp`5OXrcsrQ>8NKBc&0G=fzBqP9cJ*>2;Cs8tX*mUWNK!uW*262E8O z=hie}6mpxj&}t|yvV8ws6=DH$>-QaEo@9!M73(*htM0QHl3=KHS=rhmoD_oML}nOxmiU5Kh(KStU6k?Ap$Rzlhn&K(ZXTDo!mJ zy0_;i8E-L!Mn)le6{W0ElpkexMiw@e9ul`x_Td0V2e*@66n3jI+Ed?^xMzDYjS!p4 zxe(&UeYTuln6RtD+h(FnF(jEK(hQ7!2DLsLX<>5ZnxdAb=0NxJ7ad3M^HyIasUYcS zDS)x8r5QQxpn_#vTvxE|Jq%!R2$Un)ATUoB+5;@41?MDNfr2Tt{+@-OvBG&0M6;hc zmj!wABc!SO>#p3IVtGRxo4^t{-`e(ZGAC^~$A}9E07Dt2+D?R&EdZ=(Tv-7*HF#pJ z4;Jv}wS0z9rq~N2h_J=gt><+msy7|bq>?2>+U5B;`THw+4$L5xfhgIj-%M`vtaAUn zy=`mDz&_zsB^>75VoWGO&`AIveYGDfZy{C2(jw=)&(zS*_c)&^y)x=jj5zlZE`@+t z0cGL-G(X|~ja4IP`>Qgsgtn&5DEku|`u6skVjTHcBj{2&6Xa9kMP9tVmkyawIg9fQqRpBSlxW0 z{qfGliBVFV6veQC^l<8RmozBgDT?$EBoelv`H}wg(A*a-y5F(3v{aheOYIA3E4jup z9e2Y?yMWRjR{mi^Ty4vp;g?o30!wCn7VC+WyaCfzIE#S2E%?xe_o+MALv1)pYJ?j2! z6I4?s)_``zaBKdkt=1M%EReSAJe1g%AGgm(S$2e<{r@R4Uli97(8XsFRovlTNI~&T zu8=sYf*4c1UZw3Ggt$-jzGM~1cih{^)II2xwKaG~^Jn~4F^Ic*YA#_y*Pt+T$^@WZ~HMqDRhDk<=pHH3b*G; z=dtx(T1-4^&`g1#TU=IMAYRPb3$@oEPm1^!wE9c+`l#-SL2RKN2c<;&-{^qcf>T)2 z2e`Bp-!O*lA69-NBd|x>ee0;cd{H%p#i_+)SC7bHZadPfpb_A~osZU4UiPR-Fr;*owL#ztFC3gQwr$P!P04NT^}&KT!*) zdVaZrvhhjZV+<%&sU-Q0S%3y|#m?P1g0uoU)l zrja$ozfu=CozhuNaVz1w{foJJFQ{}W9Txw}6#|U6%O|Eo(YeQWWx_;NNZV0!3{wM> z=TY==8rAWr0s-%z*1No0L!SnRarp4WD1i@GECf4KL&MBx96vO~w`KknLH?Me$k@Qe z?CC52btLD&PFl(XS z=6U75Vaj!TQKPBm_rXiCMuby_K}eH|HW9QkjboWkuzw@SkP)W-ty&LA27xS^TljGb zB=?83;go9`mm$z0wMw*~;V9*HhB9kVhB&3X2if|qb_2_rwwnnoN1>1T6DQUspk69K zKr(@kN9olPK*~8+NoP8W6$J?(Y+LulW@k|je5RtQdt!mx`@9tD?cuTg{B_xSoA{Z= zsUQ{@7kD3Du$=8cQ#JA9!27We&2c9#F!RKe_Xq={4IupaImFsMe$R@ikOL(&D6SlF zYn3QW+=>g+2;$#{z4rkyDcwp>kIS9N45m^X^i)<{QI{1njfsS$h7ohFMc5z5j9goG zV^kP9O5r_MV4gMHskKsX)3pRc2=76t!H)sBt@$6}$_HG;JPu7)YBa@qi{H;eTTa-n zVd0{(TjzMH?5z-S;Vk4nbZ z30;xyO`hO$c$6W9N3Ybtk}Fg_=k_+_-LVxkpFgF4LA$}H5FaIqbD1DC`Ki-TNKaHeZJR;n1J zM!z_s*8vG$qZBk~o!CVP7;h8Ej6ew1*$6A|9zvfgm+Hy~Td-#g@Ul+DWL~&-3yArG z!>USX*FKY0U9#G;STu=#r;>HBdw)IOu8ReL)|ymT_clcJV=?%CC4gZZjx)UVCF zQ0Agi+8OR?XLHaMa<2F?8#VTn&-TWqZOXYf)Os?`&-=ABD2wkuAF9z$oB7* zc)t*Xax9uC-?8##vRt6<>)91dp-HvB5$TnP@EJw<_KO$r;FeOvZ`WEZ9IIS7XfCYv za1d$z%A)Tee1I&?#)g}iDYQ~cmj;jTtpOwY7J+K5gEGYw@Mr$nfG@W<3)BMJZ@ExA zS)$ZR5lg1ee3#Grk zMc_8(ZybUV=%B2t5Y)6o8WpUzG=&7cKsvTu0pHJZyCx|xJ{kAc72ULZ2oak`09`Ur zAUUz-)BwOXw>%$&mpOLf7GcdVeCgi?3iNAfU0S7XBh(LPBLuhbbI0bqZNWdn&wZTm zR%rwVS}iT?$wEejiMHw05(1d?ZEVm>gDZgy64X+jYGaDH4=jd*nh50-ZqYpvKsb#- zZBLtkS*uY3W)F}bf?F{mOiVF#@wsLG2>U$C5AhmS=vc{z!Zq)_hhgS5iaFXTX1b1j$l8 zi1xY2$4f+<^L-ZQc3@(pt$G6?70^JcR<3XNdnO7Q&WVj)E!?y$?V>q_m|?#n0`tU5 z{9FFDvxAD;r#w* zmY>4RRabBh;2(&&*ttMQE zfojEPPKn`O4Mu=)lT?XB!HxAO9k)#mssg^aF$-T{>7Lx+NNxf={ll%I;zy&k_vkVM z>e44k9`@L!Dm+P8dh4dXJ}Zey)%lppneg_`$SMYvsoPF`lG$Pe+pRsiBp<;7i~ zNHQFO;oYtOZ(3PYroPg9>KgD~%9K`~R04JfI6%3qi20?507F2$zpKn5!0`LBjzTi- z6+aU!+O{jI*(T&JLAX1!#JB&FLRP>XRfVqeOl)~oP8uqNwOt}>^j&(P zYOxN&{OA7|P_89S_w)itw9Mn*Q&!otXUIim3q*#ETR9Uqz?DfleL+`Bep?wSx+3?O z*`zrgh*)faS)RP7UL4+eTv-yTU6TwG3GQ`GujcQCJBq^;>$zgF(w4HI zQfoBs83}{ZB5H5SbkcD-IEWGafaLEg0f>b?>k~_ONwctE%CcmqVbBZ^tCkw{o{@&X zJj}^He~mDJ#hQ@&yoE&gMG$4z9MS=hd-mC+k{-oPV4h3VCKTeJc{QJ{?KMh&Cl>2% zv%b%Yj$96UI&Qpw&yhpMgspPQn(yBUQVV%1Rs;($ORv&bONL0p0dT&}Xo)b|G)I5| z0Ni5^J}c`IO5Ob|og*v)-69v-pUD$}7dbAtx#7a^IWCtlIQHF95=)9fi4qo|sP) z5WyvI7eZ67CH38@MuRd5&RnaXYUk;hETwQG0kBU$Yusmk2B=o?R$q6MyzH_M3U3Nr z_;|ybQmZEgcFzo3sBoSZ%1)3d+k*5XYRWSu$fmko|dJg6BcPpR>DzGWb`! zrV-mnoMxL577Vem*Z3spAqc*$ zk)K7FWdYmv87@2edjiyp;_%FmFe4y9CA@9avc?skXoHj^?axJD7lLtj>~bK*;w4dh zdyYezK8xXjNq|No1oz=IqCsDAg&svVRF~7jF91_NEx!8criD8-1eh`qhCrT~GsnpN zvx6p9h^~TKtx~_n_-nE}#ULW4Z#_3$G^~{@yQXjFct9FG$3zO58cJV0*5)itIUwnu zsZ07S0py3~QdzEQD?fQ$w-jtmY4=#8-(=i*u(vu~{4^{?&!u`0(D zu{BD)4}XB>O8A{t=TC@Mnv)Q`iU`eT*PYS)nS(suGZvX}I%%))tB`&Qg#@{wNiNP> z7Ap2xYYJymZ{KI5$e@$;@Ma! zmkqBYki!Pvkjs)^vXW#+#yDdBZ2#?hSBI%DoBVxzhcf7JH7b6#$ilw7Q6$tLRArBH zzbJYmuuy^ee)Mt`eG$+%FX(o4V$^xIR*KOM1gELXO%n1rhydnVyXVgr~C%t71z%#B#<}%xTV+HI_Exy?^}F} zm<=e-V<}Iz7zF#EdhX>5_V|Mbb_x4-stQ4+PFEbT#T2vVZIKWJ9OK_Ccuw2IzZWko~L6)c3>L?fm@UjLOx(48Jd} zG$sTDjS$_UjT0s&_8I+53N7X&6^=s8XZt6k--8X}&#q926@uQrb*85Jk2R$wX}VsW z3CbZT1nx@)7H}r%)#7(rW#@0(<7VJzqJY12s2lRFIqP#fI2WYPMEluA#ON)hzm=M( z=DfWpTQ;Md48xpVa&^}GagcOOEe>CjMpDlFWu;6_{f=kEPZ9=zvd{jCl~Mo~aG$pn zC>ojCb5WR_@SdFm;2!aNU|1+PNE<#k2biL#Yp>3p>8q{qvNz4ETJ#mB@w;f>Dxfdm zf--p$ZW}HQgSx3uz#=E3@OSfn>NF95g(iiSmd1IXQ8tqB{y9@AErPO^u;PxP+X<7y zbzurzFcbioVOUvZcpT)%S(v&7A6f7eVkuW@V6>kZ;#c{WM?iF+VZ+;8xGWc9NO3Kg zksp(~&%*uf{f#s8ui^>Ax>DSRutfWwKwBHdRl6q;Y=*V<=I`zMTwx2!p5G`9>QzLi zwv0ukI6ih^bsr7SoMw4%E(5^)dxo8(ILC*f=?etVFlHvySo9>pf zS;iKZA)+l^834~$6|KBN(0l$$ahuTygmx8>T=f9BC-QmgUmgPa+?wrhdO)4_tOFCHGEe1^f(%b8BTR-P{Uf zE2a!^{=O<1Sb;D(Op(B9RU@-UzTGzJ_ICWknC(v1&#U&fIcjSIO!u37Q*X-rwJFso zbnUT&_%oU|Ykn3iX`{+WzXamXRajF~?!Ax6DVOy&71u90H{h{T3#VwwdHxEuL35Kt zsqC|GMO>#5MsGo@IHez?ckT0qn2>a2lxe8HRpUY-#`DK>m0B_!o?XyY%dqab!d8~g z)_cOW8eUeWahq>PvF1q>w=HBIW1HqaLneInY#MK!YtfDG=X?qSg3bZ3r`LTvunMlCdY>`_D+=d}P7xckW&K&B+^Cx_+%q{N$>G#meRw|)|Uw)tX zNv^G=mili!3r*@=>7bF7r=S}CBymBG#u^%y`F;Ti=M0|W)y_qDM~utaFK1Gfq>Zo* z+6n@S_+#ST@DaLQE<0@oV+5XMs$&;sxN33ql^15{k{uR`#RHTu=s2{o2s13;s(Lqu zW8CY(Y~}DTg=ZJYJ}?k&5e>j{^C$pvFo4z#?JL4Gk-NvQHkFFLu~=0BJ%hoTlf=pH zt?UmdCx+atV`6!SABci8Vs_GpA#yt_4REYHk6IDnu;+KC(ER$|0n*v+{!FxX~ zr|!uTcuJ?>aVqV%k_;A=0csi9Yk*dSccrQJDvKw^D#WXM&UyVM5x}|S3F$T_PFPDI z6{EygVEB|_cx6`eHrgR-FU_4kZZC-V+=c(Uz# zTI)sy#_)Ap2C~JyXIt{EC-R5ma_hS=3PgdEA7UEKU0}F9M*s}Ilc^?EdeT$~? z12~SzsUR!|HC4!R#5q4-4_u)ZH6ll*20~>dz@#A!YccA*k2W1FHzg#ZJMsdI^0Pvc zvgAaIvm|jlfmKVcqqGUPjb6;5fnu;`MQ;b%s*$JrD~2KTJ9oohpRR@}qwc{lMJJL` zyWqL(;s3Z;69EWLvTak_ncsL-Tb0oHwIUF81qja1)>@PSXr5Q@g?ui7@N8&H&dGbQ zPi~4_Qd?KlhUWrMzgn|23b zZN?gv5h-uO)b>c$75)AC#2{e6&-yc0iZ&umaPc_#Rh#5JwLixtgz?}O5e;z0tc_8D z44Q@C&KQ|U@t(Y>&OBSeXd2O;#hzf;+rucXsb9PAxkLr7n_s2vzs&OjD1$I4LhJ5k6WdW&Rbx47s;v6NWbycI3YNC}rIgdC{#W$N29C}Rt8^L@D1 zZ(V7N7J~(F{+@d<;vQ@Sf*hJTt7X7#-UDlCfG*6(*<;34i>+b5ow{dLZg$`&E` zw@gF*rj($-b!}5G^lO|71D`8AZ_>P#XEx_DEJyd5&+fsSGHQDsPXIcIO9|jw>{-?AUt?3sz&@9L)*WSkgXFczgQYg@u)ZguO#PuF zz@5M#{6lpVeiBiWG`5_OffhU$+FC?~?S&BUV<@pwyMky1ArUvM>8)5bSyswe_B8RH z(e?^CURg;h#I*4CHVjRaUde%>+Hk2<%*qr8NN~$QP%8_Q zC!UYP28Z(t##TZ@Ay#QbljE14brFCK`oCy^fxiwl>VxsUt|d&aj=F{NX3eS{f&`bp zqh*dsVuc2a$v3KFoHZxb-|sw$LP!mIlPKDd=o3%md;&V7oaNErcvDKs4vN6@O(BHU7lj4mhDtm#>U+b>-XIRlCG1hoxW zNFFdH<*ypNi)z&1^;@W46G)8UwNp0*JwTb-!b|UF7f35TwmaIjnJ{3(O9#4sALa4y ziYX~$lX{DJBj8WW-}O>p=V;D6GxYQW9cg#0uG1pLNbI3y?*OZTO!-t9;42UP5E{WC zxn7E+dgubFu=ABC;5rdhSrx$xlg_(V9XK$k4%tK!ffzwRCj8I^ZYbqMtj0#4$x0$S zgl(h1AYZ*@f3|OqI?q8@T~y?>a0*$eVD2uA=xvG)bNfS?hHVm#Ou)^7SgJQ}+7&6a z!W3}BNz~%|wv(I7C-e>jW?I5%%jI3syGz%bCd9W@ms_~UvjyVuF*ra9YekG^rc`DT zB{S1#GM{aRg&t6Sm^=Y}ap5We1ifPE zcOrO$bCwS`^28N^_22x(u~_7;G%CeiL9i!Ho<4UQ1r~r&rld>N&vPR+;iPHV1|T4rP%Hg?OmOd+@QIpc^}Za;d3E0p*rsf zF~wkLkn2)!wFtWsAf$=P?;iA8#ID7siY!?LJb+LJXLch@F8UQDKVKuB&MJ{|6 zm8$+qTHf32fna#)?;Y0aH@YJW#C z;-gOLUlHALK!%R`)4TV}I&_RlIZ=FuNpBFAOoyURz^g2GfJy-lV)Slh2_(Zl^T#6n zJp{5hy8zIN$AOm&TCk&|E`-Ak?cb3vh>6Ahy;+vHS)ajVhNER=W+XJ5YE*zf%4ADS ze^Ps4{90=Dsj$y4j0xZYQ_rR|JHgAD(nHn@0bx9FPWFu!nyMZr;|sLFs4U_1hHV;Ph34$jQI{$XsyU zP3TbHT_ykdKdhHs>tzANx`V=|;P`yMU)50VIbNAbt=Xt=Cjt$PWbGbAaLAxm(fm+R z?eU_53&=t#~YgvhzJhz-)GMnbJqnynr=a|$*UoJ)hL1fVb^4FUHA)87~0zMo5i z0kK8gbLA7)G;wi;f#T-30T6mzG^JfQ|2~STL`$H8lhOmvr z|JZBxcoRsHa&L#T&fY*&5hsLO~y`Nie(1G|pxT;Z5TiIiaY2e%Y02$Ww zXjO(y5ieT{1O#gJ3qUWI==k4z5}r(=9>ID~T=%R}BM>g#6F5tx=P{VN^)C)_c#Hk% zZT#GzfcMa$>|)@PB?+fc-!$K6sEC8oEmQZ{LP&2X%&$NOX_&nrlb_h&zZ`X@WKrMG zl@at65%fi`OMASv<&X&cOo)_CG612M?fFcof6gIpUxi>WCjgn>gNeQs0I2t@%hZ2| z7ISyHfb5c1{>IjZJ!d`$8LZ#xJd&W6bBlvF0f=n2ihw7-kB#hh9a3K11usFSALIoa zgd$X`PE{xfWPs}Bb_ToVm;g)HjzBZ8gJcmf9hH;KWq^jG)C~~LDy2x+%9x!VNdp}x>j}Ay`@S-^g zt?R%doV#Z2OB$?I`3b500o1wPC5@b--lN4*s2n{>8n~>=SOTXGpTw*Xj`uSnlTGQp zrhWFv@vY?B#%{e4vqBxf{NV{`4Fl6j688pKlPI%3y%nP7vygfpYlgSDEz&o?(+rAH zLxmVf@)qGeCqLg2SeJ1JOCb*P!SWyym@soDCNUM(V)@_TjzgVnWHl?11iq*s=E@Z} z>%5oOwRxt9(2WmEZOW;^5T}^d)s7E9#3aOsy}%IpwOlkhn(SF2@f&2d*M#ergH+nZ zA&j)6eWsSwv3ZWJq}BQ~EU|n)TE`cr9S9XBKjZiu=!08J&C;rmb>)$9B(hBY(Q7$S z6r2z3;u_I!i$nB{x7#ZeZ!O1HA|N*fB$GG7+eQEO0Js+wPMOegK&9h2?LTXfEHF$i zR=zFr1Sbpd@R$YmyDAoko~Tu=Wy(A*_S-WDAtFEb1GzwdxEV~v-m8}q4;F3M{0%qX z-i@NH>ND9>N{apFU2pk{QOeF5TT-yx(^afUj*<;9ASyWg#T3ixb!9Ww^_BIG4tEOnKilIZJ#{ItqwH^05!81^6?mU5Q}{h@{dH@cQK z@l7_)a<7jB(hI;%Kr+yTclU11^LL@Ygt{K$`Fr3nf=^e!pg{ZCgy4#3g!800XI4 zM0#lymV}~s^DqhdwpnlSCkib5`vcD*0;A`Z&w?@bbJhTZ-h_MJ z$1t?dB4zj(##!iA{jr$H%GS9w0cts7ZS7WoT7u>8v*J`3RL~bD1pQ*-jAA9{VZ#dD zxW|{3^Lq@bLE6DjY1AdALjWK*(eGtl&0fSWC^x z7cUDg#2*jnwELHWRv2=rZEuZ&IY8qI z%ab7P-NruNT5dt)5=Pq=YMxgh0*I@H;D36s7aA?kePV;cq8&4eU|Jf0sA&V|OId(f z-(zO~ockbFEyMNs*V|aBw+K|OtQExz%7iQb=Kn2*1iydFGL|6C8VJNPwMYR7lc}k+q7Jy<1u? z>lVuI-x7TReGkfsu=P_o868jE11|dl0v)3Bo~7b4p!%ZaEh_+^<+e;<^IUP(XtSS1Bd}~m*k2#P&+T<^uBb(gQn&w}xFdsfBD0pizXReuyL~oP>_!lV z4$^dzXI=l^Un%4KH4s|F{TtWHA`t#UkqKByv_C%b`={S-r-8P!Q~>aHjnB`x!1e|d z7h$#zzR&2lC=fvX-A|xBZ)I!G8ozL!61q29fi$?a=RRMVei8!;q=lwPygXR&7qLi) z&;=`|T(9T?p^@D|sC%YpY#V!oETCN0Z|tr8z?OKGeH?K?3#mP?Gz5X%uUDMUE6$nz zFN+fo?)d6$w0g(@5v!9b`(VM%)?;Q-i#t(R<6Wc>uNhUROzk`q)%7krJl;>xES>^4 z9~6JG)(iOjx{GQxbM6MOU*$TDbZf zWOX&mXs^g;%8Y<;xeXCX&i3dws3uF0Ex)V%^TMCOw8^wK7EFVG zaPloM013vV|8M`Vqbc(@sBy_!>}Px?KZzDsdQji*PHPY{5$#l8`fNEhd`7eroC&EB zwzq&e4p4kgR*BNaJ*i6tR$~KV*h}Dkw*JLXiR=TcTq9%;j9(^bdd}~pO+{vangG%| zEprO4*iBtgNw$N?6>usRb%}_8{dt*l>0|!hsZ8azJI(=@!{cA}n`YU4I~a&GeeAOw z@YvtO%6vYDuKVOgof{$b>g-gRNSMl{g2uAn=kFxk=X-`a-{RCNRN^*n|N6_#b72ac zUzPiR`JeQ%Z)fFDt|@4ps@42hvW#%geQ-&QL;&yStrWK7W!)la-3RMmwr;s~zJ*B@ z@v7k8!Y?PCU%bYh)vuBdB24QQ*te%+AY|zI5JO$!v$s<_65)nX%T29YLBgP!V1DLu zs0@FJekkiAN2nVV7=v=oxJ#1y$HX!T#I-pnf8iSy~HB18FV zUxQ$1z!YKpe`+t5^@vF%fLGq*pn?L_B6}r#0LlJB`vZ#nVXrPcx9c zs8m^+g?#6j9PkPc-sw_(_W~z>r3Qg){shl-3)I3Qc#D#X+l4cPk|QAyg%%jcqCQ zX(_JS+ErG<>jgxYAW%d9isufg*iUI({LlZE`3yA;+yb|LlX=Y4^lja+GRI5}DhAqp z%Zzvx!i)lU0L2zPA8ci7dhjuqt}r>dV8PrPp8d}E*i8C2V$O239nC(3Lk)|!Z_^qw z+TPj%R%Y#dz6YRh36TuT?iJ+JqPENxr?U-rIMMc!mYMvfE<(xAAL1>IKn43lDNkHM zG=;4|;QpNW^n|DpEtpIV>FufXM(<`==XnN9R4(IPnC`|2{CAqA7HY|C{#7Z1eriE1qXw#MVbHm;M z<&}WRnP>Qer8u_|iI*mM#ZxfFm!PupyqhgMu`cjfy{W8m$ z2SlwAqACa>IxV*hlee1I&8^K@7EV;9JR}Z7{#Hnuu8>)GInE!ctsGX9Vy(O_N7s{( zAttsw3F<<7aJIO$+@-xSwru+2EMYDrvC@^_&NBh9ulFT6sz%J^yc~tBS|%vLhe+7x zMyRMV(XKL#F`nRoswq_ce(nzdv3AB2e$h?0#IY4|8P(+y5jRJ2(gD|$sGDPVg2=PDj=n#%tzdk#8ZPNh{|*zb z5!Vm>B=wS|ib2UMjAZy+Ad=?GsDg4Eg>o8gnP48ltFJ8`>Hbq_aN@n0NZ*W2Bq+9* z74X&~D8BumwIT*gEKQy8cPKpk{fdsB#hE5i0)mBR3Q0yl%K(d5f*@{EV;skWTeD@597=(o)eTpS(ggU)~c z_it3CHN#(`PD;*g*nTEt@Aki8E8NF#36^D~Jnws!n+~w4)TW)VU~&aahnxsL91zpa zu^0&>5hEf27`m-{5{n!qa>H%B(yz@QB)&(J&n4x-`MGs92Ai?j zFItOQltUP29Us8KGu{IXK34oYf)hU9-#7$t_$(9vA#KgEOY;889%4m8`ur{Ik2xts z%4Hi8zGr?t@xD*JPp-;%#!1fZK>&@=#y!7juLdKUT$wd4q>vRt)=dR`KSKpc=|+`d zgvvb$c#r4=&w$)8B)E*U<*EFV;-ATix1+GWI%Ax-tiAz%H&9Ex9YF9uj8I8k*(c&H zI_%4IhP4shFc&VQH+6d&-{*``pb=<9jqc+p;Z_KLwqJ-tBs#xu-wL*fA*kuVEQ41m zU>ceLZD@LH8HjtXm@Ip_&mxQ#im>MrWNP;edmtjz*4A4=nFtXL!vFj)qUoN1X6x^P z1=sBrsBhN|6~=Au*)U-W5eC}!FLLm_fw*0NpO>t+Ad}n9K9_){De~&bM4%h_`6$rf^tJHd}xLYMvw%+~PY0Qy&?FA0+C&z&cHpQY3Erp_0w zg|PdWre&7->a%s}zQ-X(ZfJTD`td3>aXhN`1mKX^v-83K`~O-?%|Sq?elbXf+X3%$ zE8;5UXc47`D9nDgOjs@xw?WY09&bTe1Qjq^MfZU3UQ=tK^xLPWSf`hlRB5Hp(Dn28b007-{A}G1c<3fzK!zrhYNsAR!ryyl3yK&J<|>s#&gNkr4WBEqfJ$^hf{nD7LAp7 z3ui`5%G&zfV`vL-@$R9|FqElGDoi35!|m^LpDbMaz~D;WXM%d`muh;J9G^Kb8G1D* z?mY<*t){Y%Ek^Nn?;;)aJ-3z$e7+~46|StG^8y&8zFk(P77)3Ctkl+!+vi$|vJ7jZ zxCwVOT-8Fcn=UU%=(kZ3;Mae{&Xj_UL??b;MR3vJZ6g1LKrq(P2XJNT7uHH`e0D|A zIhowL#ulPU`N}-h5{&)`lr?XC^|JvlH`v_vFNO#J`6&21TJOym=XmjUQSYJAfD=}t zFpgM2Dsmaj-iOQ;RYmTGX|W1i;AvzX`o;ondYzk-4L9z?SRc5~Tx+9;gW_*{#=~{E zpD; zfV!{%o|sii2%E8+lvhQ9$cFXqSyw#>jZ6w>)oQGF#f; zbmg1;AqYrOC#LW<=voBWfnftgYO9psDYFNz^vNzcyZdEO3qk-2xDz%`3T0Fh5Km=m z#*a7G%f*E2`YBV%ar*wH%(hb5y-Q>ng*hDnt5xP{D?2t2u_7fqsMHur6yrLAoUHofGnV@zd{DX68rKf=C78aaEdgchnecq{~Eb1|Ys3>sSwOJr8X z_zvffQD)`&l2ld>9Gr~;WRDB}uLp}Pvq;?q%g_#77xMCb0Roq;S{@co&-p zzJuf|OX<85Wc9{HLp5cZg{^r69EXN7rVJ%z@0ppE;bj^q3k$YW1E)~M>H+A^?h2k& zd;nO`r#lD){DOxcx1Fm&id*N+uZ10Myg>>4X0Z_ju>RcPzmtgTzY}fE(0CQ;hc)os zOlfbb2RuTAR1tGFY)@W4e@3cAnOU$9%yPjO-TEjLa5k!$E;3u=h&q2YUWnm(BP}wy zDD)O`M0yzVoDy(3un9miynujO8qJx+R%-+3XeJj6Fw2`#-PH-zP)dOCKKD*jjOUoO z;9@}BMWsT0#Jg$0G;V$r0;OyL*>DDx{LH?iq0jMzf2^EY_^I`unP){CV7~M7M*-lj z&EifGQ`m-;_;*4Q-SB`<``7Kk(NZJ>_>T+xwkM;=+%<#+>6D$3uDubzm}f7g68;c5 zm4WKOM-&=MysiKIuc^L)A53(cVf~S449GnhF=DbO{PLN{;jq6Q1%M^)7HBdMgz2d< z#bsg5f}f$mFr)So5^lpG`|9_};pg_Zo}2lE^^HL6-?|31N-CJMjmzgME>N@KN?n%<4 z@K-*dY_M+{ab=ISHRtx$JoC$HKof?{FSU!%<&?E)TVb?02HbB0Cf(+ALSBmCy5`UJ zXNTml_#5g;^4rYwp-?ekpl1PKc%=NX&eu(9PcjJyQvFwNEUFvqEt2^Zjh0?2LB#4T4M zA@Wq&+8(ICet2bdu%WOx;fC~2t&$Wr1zlxNU?KY%Efbwd{uwvT{apnrVE_Fs6F8Lj zv8>!@&#m9d?`SE@WT|*G2X31n>2$panBR6S zt@_f(iMS+S=J!0h$W;`eV~pzk#C-uM)I{Lmvw#Dpo!Yh$$-QzQ6Lv2PTMH80k~66Y z1ftN~&$q4Gn1Iav8LsFlXn_%i z@qMn0ny}b;dn#O>w;~3^_C2dnZP9i6HgAPmA`M)Q)8?L2W+kW+?Qm&xQ2`O~ISPu5XXy>dh;cXGk{V7`d&T=M zpjfB+lw+8h+yb6G>NlySX+8b~c#YDnH|fIG!|-TybDd=uREoni9+&Z=9+&p1HBulp z`|4Ed)ud;MfG35uQBH`;G_j{zD0p&f&1#%%^@~Fb1Nq`-R7FcRFcbUr)I#Gm0zMV> zCu8bF+`(=;y@}D3fUvcwGXb_v=`z;f5MEthGlOyDh*CpACf=z&IYSwDcZ|>bICn3q zWiTf&Va^6Xxcj9$6{I_8RmiBe!R$)pYblrHEngzuP(dR-lowJLp~On|w3qnI?@ z*CBJVih{s=YK8fT48FF)=-0SG@$G%2cn}xa|QYU@DFi-erbFWU` zt<|ZhJ{M4j9`k28q?}%K!%?_GI7JxCG6LNH{LhV_8R6f_rf&)QqG}b?`)qK2CqHW} zgh=m-f?jdi8SHThv!xZ{Gx|vy#Y~$HrlY8C{kYa3ZL=Z_r+#G_n5b~MI3o+%;Jm80 zQZf1Caa&=OCFpW3m@q_0XcchI7Bdw1^0=?kczqlQTIwHD?N?w@^GwPWS2oAmTAmBt zV$kEl{!X@j847^&uda7%wV`}Wd{y^O20WLR0xTi_i1$2zHISxOl_46YquQFf$HIN0 zZ$881Sex=3%esFHWt3kj*gG9oZbw>T!-&5}iMncoV6Ws)d08l4hPr1OBUoDz6IbS$_76Co zTUv@+FY)*?2%c|WD@%Ez7ujNKSK;Aaw zb`Z>GnK{;bP#g5yNzNcpy|u@n+w{+aNmO1uUglilUrILTWzmjShSL4EZIY3jDaq zWj7kTv1STf@f(ywK8y>!H-FXL0uKT_Od~}B!LwCYAXTa)SYnJ7rg-%$h;RK;f>aQd zhNHo=!_6A({RWGr&QsM>Dw$#g4liWo2719MLDr&^{yPJk`0;`GfbT3iGRZ>*ofgjBcSXC9*7x6n+5T9xRE?yW#+m$N|e>=8&Wg5t&L=%7^1 z`lejS;s3_BJ!ze$JIGl4&*4ziKR$7tOrpR56w1c*=S zPB(G6Q8H(;kA?ghvEYF)M6zFQf)G9och?VSHfQS2T;L&ap&W#{!#F^^FLt3aoifCcDe{1IN}#IxfP2E<@!Tm%tQG|^v%1d3Y|-zD!fj)rlsFI8^D+s zaCWoq;eY-Yq819*wbY_gQ&w#_rC{>Bhk;qk-4bu1{7W>MsU_gj5?pdc6Jjg?E@R8G z>s(i1w1R6b6hchLmG;(!U_r#KGE5{j53V$Aqi*wd;1i#3nRJ;-s90uFm|tPzijv={ zB*U^8^l;8EHRPleO>GqUEKJ7w(SK^KxWf@~Ki)EefUHC?vC#g#z0~G6$sj#7q?@ai zeBTLs9D+sxdXKRk^L7Qb=KTZa!M6>@?3GdG;=I!GTp>s(@JgB1fZMwzLM|geGAP50 z46L+aI|8luu>kMMQvCwy5l!pY%_S}ZxT*X5X@GkHhAxwE8pi#_1(%%}voLm_8daLA`|@g|#s1{{MOUV{1u*rfCn0?^PfoAX5o?kQ&wW z0A&Hu9$-Acr3Dxd&>pa80n-DfN-Z=TXrWRfh1s9d?%8|J-JYJR%7`bjsv2CvXXDyR zy?>Ww$U(yT{{0%b0id0!oOMNVs@xmMtt9 zoWI8y$+c-WUd_p#x0Mi4C83$E^kvldED|CpETSOCMyIY|fJt#tTsTxXjb(ePV{o6M#S=Sj3oo@wh_uxfFe_uMC#PFvSi36s7BWS^I9tpHlCx0oJA+ar_9 zlIItB4$BtvAB4HFQcaOnIa4nDg`g+t6Ol{}6g|u~y|Erk*$0iF@>07c^srL41ITj1 zYFKRw>(=k-j^eS4O4}D$Hds>D4MWXp;3TEn&O=sKSa5URYV|_)ZahQNMnb%$`dL)E zm=%dEYtqA&2T@pE*ub&EAO12;7xJv}>PO1Rcr^g-{H0JFY6wO^kBo7MjClm5GK5Zu zi}M%upHRmr6qo%%1(Kg$>}%=C7}teb(uFQyg2O3Hz8sWC)jb4Tefo{IM)Yol~c!Hw}* z#%^}7(o?VxIfAjoFn1$#m)F9NZiFE9|LoO_& z@A4VAXhz>ZPQVI*w;;~BWao+@g5HHtLs)UGddX}POPDcLx8kzp^ZpK!WO6og+nUcB zIaf4`pNCuT=M|($@mB9X8%DT=$UH+Bn$q7Va8HU$gmgu(MBcLGsmy}_a4o5C3C@8D zn3earo%}4SX`=p>^>pIqP@(nf2DDLIQ?B3q?qP)pPlHjt~=*6-56S~@3il!)&W4gG`*B`_$5*_N#e;?zpK zeNmzMo8Du)>Q$)Q_klyBFNGps01;f~??@j6x=Qa#TDdN@>eqT!>$8fXv>l>GUZLQM zT}qO!S6jO0eqNk)*)?SUIAFi$J~1!OwroK??H6QT!& zi2g+%1zJhvvbH#T_)6*btdO%4NHIw;cJtNQw!ZC9+0vA2%LMl)oa?{-=f1V|enyC} z4C*%QgOOR;cCSX|^h>x=-)80~b;AHWe*|U0B01VKS`@-%=nDB`83>uqYDM}!gZ8L7 zC;&@gpIp`|Bvyp;|M|bzB9L;1e=qU70xP8nLBOdMLX?-X66U$7-``Aa0SLG`w-|DI z0Tuic#K3c{8ub3e!!zXf+^XQWf+vj3TI@453KO2iJ(m9I$g<(`W$!bo!nPnh5guJlhs;Ya_gWexEh9qS=P?vk8SIK``$}Hw9x4 zr>zkHZac0@(pOA=5p+4xFVVMrHUc@2hR<9!W1{2q+l(R0!UE*?n-~bnf`&{@s-|95 z;+#<-hG~*!&wSp`7ox&_F7r>wg)%_>3^3I%ee6ifxR8h#qDR|HXjR!-P+5`m{-k_z zvV_Zw+@8mN{t8T33&kK7XCY`rY`@)pB`oVbL0=b$xTlyoUZxjvud=`1z2B4ondp9A zJjwnHzMofC5Q5;AS=l+9gMo3Dd$wMD;)FooV<#-T@*iAT(^m;i{qi#500&j#-_-@G zOuYs7@^a&oFkP02{bQeA)p{mF;%XUhjHqOcu5Fi8lr5<+D6yyi1Rut8MdR#OedFu~ ze0S=9Uz97Hte?-)`+j!Yn?GxTWD*h_!~)0$cx6NzrBHk}E9n4GurH>HzHi!Y;AmO8 z(sslGF5&N04Ho%Av6Qe}dv2xvos}E)5aFMXQ}0VZrF%IsK2>7`b(SC&>I5aBC9HFS z=}jpB*HsfCBY4S1Oz{mbu-K$8+45pgwiYV}-^kn*vLGGzsZWiwch=7$pRkscR!E+M zNxXC2R%PEeU=qGpeleF=cf({25WM$;yO!u#R6(q zygFh)UFr6Ho>NxzQrJzlC_th{fG9bQXD{s3FZmOU6Qub>Rk7aXpfbb4TBT!n4tF|~ zh{L>;lOeMDZDf;rm(NxikiGHYOl|N`k7YAlFXY07xQ%1e1+&=3dCx{M&Jx-BQ_l`N z8f|8INQMzdfai4^Ws}A>n{Yn~@S)Dd(0XDv_+S4sP;fMz`s!ST)Wz9-jD=PvdcnFR zn9$GEYASHa_5Lm{D#Q$cf~kNTf)+ymd$@cLu}FbJ-e!QBUi754@qUck`quA6Q2%UH zEg+lKFKT4|Y{^#s8@sNct-qt&p#eX4kA^GzZy+}uO_{*_z1TnF7Lz9nI;w=g{xos# z5e9!*UP(WO$43ceEohKbthk8V zh@9cWMdZ5W=c%e&Z!0y?vPMCIvu!g9Qil6#DWt4LTei^SGGp?0a!oA}O(PKQ-^=tk zb*|6b#QofyjkM?~gP95|+^ybuHs?|%ty-sFQ5?!=ef|oBChT9KtC-N9)qRp}YnKIT z`z4oi-&8nH09in$zlQWaBiv5m{h9^|J7VPJdFxlCP?CyLM2PB z>ctbsf3u;xZu`7&x!}rzy{QbvxKn>mLc&1iG0fjQ%I5{G;tH4)wxCThZ;K%$hV!jng}p0fHCh$)G`YXWqLvCL zHxLr7?^n1K&%`7d8vNO-F0l878VOfqi})OZAaf`&!4Rajv&YE8?^Eg|!KN#mpi(Cr z)vZ#1n5Ya<-GRLN&h*0GP5iu&NOc!>3+r8nE1PFqO{WRg*Mu$Z#@9fi@?9W%mz`^F zlu{lJ8uy-0ex;wcRA}uQCL|C@8*Az%vUMj>8H;hXm+o&2{vjJOFUvXScpWc2%iWIp z;l2!PpDiD+IbY0;JM3^H<^(T=0)tW;K19LCGrTAFMeMMlkLDe(?6_ZmCo8eaj z%u-wAF`3*2i>ke6+h#R@JY4Q!Y zxP$6mlZ>7?ST-bnmcGgAYIzzk_O4k#NZfILrc>;!zmp6vcX~sY77!0MKwVgf2V~fw z9g39}>gU1XvC$Az02HC=n`ktJp9ai%cD~^U4Pk$&BiOaiDPA8Z|>60aIYJeor-vw3!jP3dNLiTD$mmQ^M4~B9^ZU>BAkH_Zvw(4NDs((8r7!WjR7BW!NCL;s1JgEfj^irRot6IrOGr)q><+<|sow|>Ti z#g&OIvb}s-0f3cd{8g1-D2b-qv~OOR%Rq3Ne7Mjq^Hv_AWDR(bWOn%c&4pMY*?m%^*{(Gc)kq zjMu_LR(LhxCNn#nAWj#fI%n3QCQ>s zi)b~9qiQ!bY%CaZwJh%W{ilEM2_c~!6z6aL@5zh%;66NsS~^I{D^Zcfn%9posbAkF z*#@=sOi;_M#bLwQ#czk&V55Gp!A9c3x*+=jvkOfzT$buZtsv6yl35jq3N-WJdGo#EBCrI5rL(8Xq;Lt_i*)#UxT$G<(Q435pa^b$HZUk=l~0V zWmIdY_hkH3Tg9jcC{(j{CX~vaA5)(-tA)~WYWamd+9a%vgaSI>QBl~q`xRq@ zhXa{{nEAJDDqk=FZRtvQ^aQH$k%Hl}3l*>ab#vp@E~ky)3wfKI6>gtS$qb~wME2gG0}e8Nm; z%HBRuac`hxBNd+=EimK;00=h0ugH--oMo|;2hdGPNZ>mFHpyl@d(|Wx#z^>8OQdd? zF!G)`kF8OklQNJYKW_G^7}5{fTLq7QaR=i}Z+?B9_1W7%z0FAM`DMGfPiy8EngkQo zCGXNz3zrtpfyt!3!0>8zh1%NHVqPO8-(nurDlQ@PDTjq*0g27R?=r?390A0g`)rzy zJy`9+10|p>hE{@L9ds3I!TXVKQOJ_lTcvLRN5XD}v&P>(T`*iPwJ?x{f?8<)Q$=3@ zL|-uNVG6XPF{|`aNw{-jZf}+-M}#p}j`b1^Zs}-mH||;@@w|o}i6wJANZ~{;AA)Y2 z<=nQ7xi+_Cud1lzQXa+u$$J{%flX3LZ2|k))J1W#LILCcV=Vk@Wy$UW*i3X8WJQ!4 z5hj;O!y@*}4L25(Z}8hs`Qa{83>T zS1-(l_f`F$ha!=oq_~)Fp!DBd!T2*Hr(ooAQf<39MQ1r)euRgiQw>*jS4G@%|OyZiB=Dj&l!^LgkW4`7$A( z4<5&^34J`V>-iW0gye$yg>k@ldW#G_BDIB7O;tPb*j+@B&*LRKe!0Q`i*jGy;qIim zeUCLaR2f-P!b>eTORrMrhmc8vcT-eNKU*)8jOAtug0`6Hx>5P^<#|&6T>+!SY7#Vo z5)~IESNbq%$-P@bYC60oSc?;k;hVUR1)KszZb~s#TqJv0g@W zYUTEQHq>6#r++Deq_@yq6-f(Ir0n*0@_P&d$#h7r?|)C^KUoL|gbaGCFlDDs-KeAc z=lv}wt3pQ=>*9E#?&O#`<~*tSPF1$t2MF4eh`&{<>ufua(YQVP$gz>uV>0!1oP)gUUcmsO-NtPIOw}aJ@Sqp?{8>OI5;Ol8G!K;Io29W&U-;KOW`>PLRF6rDEdFB1 z{oUr7UWKRtH~?Le*4Vaao~W|OPK)ByOrKWai6z*=y9EB2yp)PtN=i#NdlPIdy6+f z@wMIAl8Og*&vsbMph%*a1^3}S8>B@&aqe<|7D6A=IuaA@I$d~4gd|=Au)R^Z1SP*d zG~zj2KyJ;R3hqlWQM(S4c?O{k25*`f&6n{?h`Nl#1+Sh}1ux<+m|cQ@1(m}q-2)KW zoIpqA@`4HfhcO2Z8VjoP5!k^>xWH=pL*D;`qW|!Kyp>_gd}2J4Y~GtzaQofNs-Ue2 z)PE09qE**Y)@u=G5UJkinLk2dg|verR;XGSunzhZ+t-s)1-PoJ_i`pq3asBBhDVqe z=*E^S{jeMZn0<;e1(>{fi*|)*+`%Grb}3e1iPLXr_^^AT9lDrDwD;^Jbqt|IlFdE) zE=zmjuKAp-QGZKp)zba9KzS30+k9gLj|Ok}QR6ev-?6rv3R#y-Yz~So8`mz^#B*om zccvo)%iXd2mvmh2&K5~Z#+y?e{slcf}EHT^0gEm=U@_Na_ zx7h=Ylf~NOS(^W(Q=_#_{Jt}yhGy0?$Fzs^RP7A>Kvqqhr(`IrXg>j~8)-IoSStsZ ztdOIsM*GTOtaOU%+LIq-D+dK0&?z7Ve+q{|PtqHMfB7BpK_byWq~oVT(Y`2{o8|R3 z3eZrY?(%nvUEF`GXanaIjUXvmpJFQvE6E&==4?g0m3dZJsDefGJ8Gx)_mZ{G zwYVptHbPvOQi5%870`mqqyc54h(@nb25G_a@V7BE=&|;E-k<0mIrtHp%7)$p8DM1) zMS$?_2!X{QH4OtoUAyRleOwzZ@H6+B{+D9AF(`tr_s7=j4ch)&+r_@5 zb=hFJNVyU)+(04{K%US0_gAa|>XNivc5my}GLbghgx$$j(5AtJAb~v`MCDwa7pm|V z{cq|%P^{_Q@3(04Hba3-L*v?-XO z1uhpt{CCq^aRdzsFo;k?qb!bKZcj~BU{EF}9o>$sVA|(BUsha!Xr3>8_-)q+&tKD8 z0pE|cXSbKNG8B2mx*~G_1B1^NdxS7NxLFq&uTxyG@h)C~rR#l$)@K&1oUF?|*c9)q zw+XXbdGGHZ3PZVUAQ*YXkOk*=d+6f?gAY~1x^01><@vlR<tUCm1f4+@UX)OsB5Zhy)ZhF5Vs)w1k|%ukQdLC<;yWq12;N&kww}_B z0-9BGXKBKRquSmS0lBD6p<*7qbBxhO3AtgC5;{G9h`(B&yIArORLpGcFRxw)D+m*? zT3!JsX+jhVO7kGP5og&qCyZ0Ek>wk`TKPp0mA4L%aX^H0)W|%q!lvKKa!!To$x4JN zaH*yzVGuP40fX!uq_l0+z<>Yh*}8r;JS71gL574=@YY=}FgIP(2i}M!>j*!?Poe6M zJNxh)q3vvK@(fS;?f_Bnx7ilhDOZV7_&{Tp4WHBg3cj<@GlWhQ|4e*X{v9_Mk<7Ep z(I!bvtIV&^W5C+YV({%hGYG8R#|>VCNJ;5@f7W4%caqD4xNz#Q&!+v@ zby)djV+CYP1G5UR09XwK8Ls5eg%^?(;%gkn}V9ivFx?e&Nd&5Z#JgdP$j@ z_V24IOs^36sxJhJiAmgE6j$zrm8kB=%LR&wMsitmhWKr^aVpQH#fxGLS|EOEoOPSD z1-LA38CVFKW6*7ryk#aiLlVnc7xoxHjkJa|K%TAl{SwKq6)S-VuB`ceie5PSKAdNf zSQUSAQx^4CI2ZawHB(E#m?zWlSy@{_$c*};0FzNTWrf6hM#aGC?DLg;n?WyNw;M*% zoad|5HYSXKY^N4ylUjxQcd;kSB!#%rw=;!s`xf-CX|t|EtV%;Bk+v4~ZQ^Z>h~i~A z;dT(8se~@#RdZ8@nUKBi1w+(EKS@^0J>SnZkR`Xd%;p&ZQM91lxW~OUg7BWKYG_R* z4;D$ybIBs#_WL*D{cyp}FXk~BG45na@UoT#nbt*bqR_=TGUSlU9;DYQbLGlCA!vx+ z<|YCS)->J*Gb9O@`%5iBE!bbG3{nd${A^9gbmsi}DIHY*6_E37>wVm|6`cDFLhdKq zF90B&5eX!Sz-c&g?;e1!P8=5z^kVvV%>-&wdXKr!`~Q!db=IE&2Bncgr#%v}f)57& z^*_}-U@aN-@Eok18unO)dP~P81pFTR~!>yx}o7aPQx{XF-sdkCE-raFpxjknQ zG2F6v@p)^dVR$C)`FP#I`7|+=D1i@brt&ibrr_=0NDc7pNaXk9_`FS%_8wDmLK*Q% zo(T=zUTx7^ZNO09TXpden^o{xZf%r2Q|5xurfd>SJ2lwjA{R{dyhV+)iUne-uOo_j z3D)KBRJ$^Vu0Bf#niIRYC?NOIuZ`sF!9+l6w0|#Z z;wsA~1cH2v$P`n%50>6ej<=01@I4Wh07F8rz(lU;xIL90O5lacH0t|6-X4`9lNkJ3 z9b${VdIL+@#?UJI&om10OQ;rxVeNk`AGAuX$qnz;vE52MCL|j_P zk}`3l%nfY2|lOw;ej!UBgLi*zxeXQj0n$YfwyL7`*ClOkBwvN~w9F zCI9#npJSNqte{AK9!T0vS8g##Y24&r+Oam-80bUNvx0sLGU`u;LT#}KJ~0&_$f3?R zd+QwPnM6^6yt_-u0AU>Ri87q!ana+sgZ5{!m@D5RsKVl?9jEy0K17Aa;sTWtuHfpz zD>j)|PD?X7WQk+@zJIZk5;iUWM#INgT#5JbW>S=Y*$j>HE+9|gy$bRU+z3$AZu1%s zZUQ{K$Th&Yc8kpxAEKefwHCCX#InUhV#|*+ye0u{V7{s}*+u8ICL~*3x4;>r7`!S~ zH?RK!ICZ#~m|8!;UzI6c3Zk<7>vCha7wOsLxtr&Wx%Gad#BptZm|>SD;aa9M>2J(} zwTfTV7K5-LmKGW;jHoxt6+&o`Cn@lddm&W1&H9(xynYBfA%I`MOEGRdMH3AqmEs#7 zXp}rf-g{k@d4Nk*En?y?BOkP_gr3>xHk^A>CHTMoM}nYxee}))HDl4Erp=4(9)!MPk>*t`J3st6ABBQtx5Eo zNt$2oJN~?lkfx~-rz68NkI1og;rJ1=KQ$^tY1g;oatK`%?Za=^Z5jkN3Q};QlNarJ>#WHH zf+1u+Z*ywN8fdu%KhwDcI=Dr`JZm3w3O6TF-oLlCwXIt)0-{z+AHBxI8)-Es1Y)K< zCws1NyuU93fPnNxJFBDvBz`htEwQ~vpElZIXfVCtk`dh(8ks_%D08r6wv?Jtz0d0X zPP9i~$*HWgWih@Fqrbn^palG;-LnOw$#00WQl?{YRcoE)GMy1*oYx{Wg%M0ws4ds+ z0Fk_(3nq#oX3y5Lhpw$~%N7VLAXEE3Q%ri$3l{>KKkugZVK9?M$K$M_xwNp zSEJ@viHnifRG3fZ1X3LY+=g~kU8KhRQWwycH#ChkT41tU~nc; z&faGUSXe=$v+ILt(1-F3TQ_;)5&k*$f{SYHpFtunI4mBig zo_Vg%GE%%230e`>CjeQW=kozHK2|b54#D63fE?=V`{0LENX#>6#Jtb8N0jUDSrT;*U#Lf; zgs9}6XbTpATR$SZGT!r_$QTwHR}LuQQ{i$QC*W%oiA{;x!6Pbqa0*cVIq~mX;>~x# z^n=DonU!#B2b&lWDPi!_U-sMb*Tsfu?xog)EiI+{888qrmKfwMd*9-4kJWba{o_Z zADh~nsttSPmcZ1^fPB{8ePN1}mT}P}H+~{7w~E4A4vJ=->5^1`47&cl=!~gFYAaak zsvo$7+?^qxC3s?xcy(iOX?}vJN~f+;qn$NtMGDELd2@uL_40I9LGRXY{Pq|DeYxB0 zm{)Na?H)iz71P1xW+_Vy3mT;vmZSp8Flk)z0d-m)+lO>#~+G8Nl zs|cFDwmw)2TAmI0$(pa6+3Wov+ppBqVj&vU;V^gW%Mz@^?@ev0%{@3m0)lvjpFxHW zAyTU>P|-nWkQ*OVfr3ruy-N`K`%?(4rnb5sZE+PIl~fJ@d~aEFWR}>{0R;GRO3aqf z6AdC2y$cTR^S1;V%T_AOHpoTxN|P#`m(RMgk_6C=401v4{`QT2EOLYW{+K2fi3UE% z3Kp{#l3vfO@W=o9ACHDX0tnG6L4m(G{8^0_AvtB7MZf)iKL-GCUNPk0M~q3mIPbC6 z&wCcNS}}c}IDW<`fcv`?!uwev?x{K7GgS^bpOmuR7TRl+XXMSRCs`R!I zM60PDA<))JOHC!yOh9NgLPfm$8Il|H_gHXc1vtn0c`jL_RG5H<{G-`2Lr$fVh_NYM z*Ww*l7LY5kdb5@K8QtvX%bI5hLbr!o3b|%Y-On~}qzoA8vYoQooKX)dK;KR;^-ar& zQy^AM02C5vP(PiRQBMKC-f;k~75Zg4TIhH1rV5CKfuHH6oP)(mIleszMQd{`%o+NG zbDZGnEMY!(`K#H8BPKKyDDpm6)LTUA$3is!exxPeQ|7IJjNDl+iJltv7iS?xChoK8 z?=gUVJhc+C<@53cP6gbKbMyj44b#aYs%&2h+X40&g+p;_sutvwT>tz38^Uy~&tJs; zlNgvG#Gwnfd^YK9FUmQ8YI4b+YQmQ4F3MK#JXk>eEr1q1Ea;xAic?z%ED>L{Hu7^t zLzP?BRb{~*ZHi8T$>?}-yv2pEssOHq0&`h0Nd{ZMG;Ov=a;*Y{KJ&b_7P;K!3RATr z0=!+IQ~&k9(DVFU{qyz|{*C3O=DY>1pQWw}2K;#&#BP=8AH83>F(q!l@aGnO17eTs zQVg_+>ZNXn%vN31h1{DQz9`uzTlb8#mBr!EaxaAZ3#yp)hqnw?9N@gia~y<;Xb1q2 z{_r9HiZT&m8qsvN!o)rGb9v~mY6D@|4z0S+pe|7fIcJV{(Xq7R_OMKZTSSdH`k~lp zTX~;B`{(6~!*5jppXov4#fhne5LQ0dMDN*cyOUI0{#8PO3y7hCzT(#WR^eY{mGVmU z}g?P z@_x8hfb{0fu@h)N*XI_&f%+wWEs=__ z0ev#;aC^|^a!clWWPo*}94Bzf=&LRIc?p&QZMfB=tC|8h1TYV{<+?)M|J3G*icEdu zPD=H=!^vNF5&gJzn;4SXB&Zq9W4&pc9S%dYgesgbNgweUBB>pe4-T z4Zd{^&VJI9b z+qbsC%xAYT#|%vucSY?pV6dE{&40@Ze;Pov65PT+Hx^6#cc&77gfiL^_$ArNFNfJ$ zoZUmOABS=;oxWuyO#M?{eMN-n*UzYOJNKBi=?j&))Y~#vDB%0-StP$T+xyTlYMEz; zJpl3q_i%$CO|C&cZyJBCXj!Y<`LV56J(Z<&Ybn+D3?#?fZudMB{F)@71a(C**_t{- z;GUQX+Ez^sQ!&AC8(Xygn)LBJ<0=!kvkjM*g6bo=J*;`|W7I|26tZ}JUGT7_A1L6~ zhO!Aie~)DX)+p`YBl4AMT~l-$+2)*A#oxzpCmhLwW%2#I6{~I`0V)*^7!m;FU>*aY z;pcb4bu7G}m;mo*er*GLHe3B|1jR`s^B=6dNpTaRkZEmg%s6&)w=dUUO%ds_zHyW* z5;OlWfJJjqm`hUzvSg!*HnVoSpv`_95e6=at^{y_yLD8(WF-lxI&c=H)zM+I7=mxe zMvBCk*}@Vh8WNVy-eVi$qdts-v>h{Go|VOiRZCUzRO75^&Z~NgWpBGgZHNI~RnpeS zlSyR*y~a!KCaoo=%v9T$Fu!1{AZ6$DlfR$uFvnr-@pL%aA>x9E-lUvL@sN(T81p6> zkhoh~)3?k4{$Mf*grHWK#HURZLuYrXC(AFj7O(@GINr@^W;1tIal>cYtJR;0pEQ%_ zp~7A)5Hfnn#T5Q10B;Ii;4Wz*CQvS4vvxOojF&#O$aq*LAx?p}y4^+8{pX|!5$gt- zzwk!K3U_|SMx{s8x9-?>eDm$Lz)IBUJGzJ@fZoF5pE`sB@zYaPg3=vTiV8SyhMoS-*rW{?(IW zglwLp$@82F!Bzd*69p&BV6AEvnKBp8+b)ob7*X`TkOkG3xY@J~2?@OKV*~3vGjWkf z=oBY|`qOO%$5|%ifBkQAmZ?2c(OX>7tYFR($PZLc_{8DLQrz$7l7Tj?aG?9=2=@$2 zHx=$lOwx@B_gpdz!7YN5gI`~%B2F8k$@jQq2Xyr$NMULp$N!c?hGpQMXr4ulEI(co zwE$!$&oFKn9D!hdK~X?*E4)SgYORpkXp2(EV(5(vNY)6Jw|0(8!9zO!N4tnlV_2GnNMhh z3_@r>U&2%zLZGp|ZH05J1y|b6Z5kExe(nht=jOmn8QW_S=qrpH7247+QComU`z!!> zKezi;y+Eomyib%Zy5k<`pBMrFEJImt4j3h@<@N|YEBs#HxdNBJ!G^j7_|Mm;yySza z+gJc#B_m;;H7ePPl9ZosPhT|_1h8VZw-U`UmjunE?_*q$(Ehz?gLW_LJ!?q_Ox0fj z+L{Vra{k`RO7zqjgO%hre6WEuI`ey27rBJOrMBM3quSD-v#gQ6Rc5TpC4Sux?h#1A z<;-(yOnw5+e73v_;Qn>c{O-wEAuS@)L@ezfP`C7wzHXA9H5d} zy>LhDS1&Djxe!g`WEx*vh*X&Mff)4!jB`O}i~Tt%N~X>oo_&!7qkvO*g%L}qiUc}; zAPA-?M$OvT7j-6aaG>}n%aS5+I#{Bl}ZNk{vo3iu# z+D?PN=95DY2RjM~Md8tL4_Y9C@|QrJlUkZJ*)`oB4GPChG`LFSUVwIUp9>I-w_^+3 z{kig3RisN(NC>uwA}Ddp9752^D5{KZlWnYc1!`>bN@xt{bOA?I%%PJ&Y6oXR}7fB2cuL!2VKO z+!!vLvniDBtXzJarF@+QBOqZ}Y2q?{*i^Q<9%RyG-qe6Tj61|VM+_77l*In2*66d& z9|}4P^PJxJZ?`_cH)VB$%Mf%{$J%98DmXQxwNS;CfY>e{=Gcx+^2=dfFYjRz4t7%- zX;Wqps;YJaa7w_|+0sdUQ>qt){drRsJ{P5fpqJE!^KH6nACVwGAbUG`e$^33pN_;j z$^fo8+H@M`9RjkJ&_#_Dx-<`Zao2tAiu}?LKjms2KXMOH0JXT-Hn6{Nc2hizx>5ts zfbx0wMP;|7g4u}q(ZtLd{z5Wde~|6J@Rd*}8;D3ToqckMljmk>IJBRXwEpd}GAT0z z)hTv^S9cvCg?x!efaeaWed($NIN73tX``5@@Z0!kcq}f6^DFWhWm~RmmwkzxE!YQWr^9ppD@W_lr6U_i{KpKX)H^fe;57Ay`~& z3Tmr(Ho9nGS4DXpFyd*m`}^?}9KJmkeZrkA3l@Ux(*O0pAxZubwqlIZS6DtaguqHD zCq&Br++XDp0_VvubA*4uh!u_c&2!It67a2p%jEY1;EKFW04cl&?#b5A?i(aI`h^o>)$%0(Mk{N}W2mBUNrr`qIsd3lmv!TMDawr#h%RqZ6j#8dywH*`Yo|Wq@keap+ zuI5~vgk{_r>Q?DWObnj4G%0L_+T)@17IL^+2EVhXN=^?5z_dC`;pXpZ z`)vk=LH3|VxSih++%kakD{2HeEKjGthyA8PSV0(o$a&9@s8`X-zwDGU_!eQo3Q%#* z_Iob#I1EQj+w4J#S-F#u%hdijb-;7e-mtwNuS*U@EFoaqf&1;BN9qx{XSfvHUmZHqmbi?&(`m{GT8CPMvnkSa1#CaB*d?a#7xtiVbk|Z*Qqd z)eYhw$;a8$eK;0Nw5plTiw_zT2`wvVNz55ZVY1vrgId-G%1hD81)SxWHi|{e3z7fE%QW z6!eG@;;*-Nm2Yvu+*y6I);>-7FBfFVOVpY$LDCN^NyAfpi}UkxFV(H&#YT%Qt^g~b zZ&WFV<}u5{$6m^{Ry6<;0d8*)PhH5zj=4;nOOF+AdX38X-=kw<8Cc{nlqpsW+`=D_Od}Av2-W^GTy@YK+F64j_42|R zfd3NBAP*a<|0Gjrd0sAgNoSfF#~+29W=5#R=8aCm7ZioSTQ@n1>rA$lUXe{*!P#M; zW4b}axiy^_eHS9$*wAV9W}W2MnCCj)RAuYYWZ8RxTEt)C2`f8|#h?TzfCh3-CBW$L zu1szUVLCSqfs>8$`5np1wGqF)#AM~4H@igWU6$Y)&AAt$ua*(%DKb9><6Y4`%u>0P zB%32506Q5CwMVJeA+chiqD)>{fAI&uHRxi6ic37V@Q3JQx&Hh}D}Ds-QQJevYZ6=0 z(zkVJ5REd5gID2w+tHrq7?^!(r%7ycr(sZmexg#SwVK5~U00{kRUN>>{x;MXAEEsQ zm7G>HN^S+I2Z+)Z)(mKwn5_**Li+N_o(~8_3X3?{Is_8pkH{1+?W!y@ih<_VC3uU0 z5Bv7}OC2`-pEt0~DDE8dV%M0P%hA?8h<4x}8Oc&1oq^1nLh5k0c0) z#VYu-t!ofZu)dCw4w1(BwIMJyS2OrvlYp8qWC zGQrc)M7tTr`vBDK9IUFcs|ugsYRN*fG@`BOPCj-IF-9TSv7}K_M;>DdBnMwGrhOkX zaFZ?th=3BaaG&3GRl62C$#364vW@V;titN`sg=gBQg_r?fT?&nK`B+p=Be!$Cp+59 zfZK1@aN!B2QczG7Xhx}^zTWiU_e;*WBCgWoWaAL_8wrrGNMB}WynHxM#@XoArTJqY zG{-EV!to4FuYg;ZMsc}?GOwhBS8Jw8u26o*dHJPLZ+p7&)T*w~1DT(B%T?gASO{4a zrL_+Qaw>S{9WgGPTW0iZzTno8kQ)?zIl(f=?^fRT+60uwUC@{)It<#e(@0!inmFq{ zFRInq2y68rA+yHVo=+ZL(E|1blesXgphz4PsGjW&e&i|Ed*}+t-C3Hq{~7{=b7oc@ zMO(uUYF3YF(;GO5@%h^tmWC01#NCVoYuTKmqARcvE|oVDHj)foSe?aws2BU~B@u5! zzNbA?SghK-kPqN`5&H@YAHQJzl9UBin60J4b3#x!9xFV^v%WJ&cW(;6o~8*ugLGX3 z;4#3<+ERu`vlP`<;^hUvuTs%e;-uG0 zGpp9`vZUgC)H)^w=1=dtM$ad|VxPe*nwJLEy+xf7_u&>$l_Ib^e;>&ku9D;f7SO5p zmnTl%RFCX}E_b1h{Azq~LDl==HQjgpPnh)w{g>g(!d0OVpf9 z@+a>58aPv6E2uNBskrvnfX8ybMb|Lr2)k=U@Ys2V(OtmhcK&K{E^&@KDSm6o*;y}8 zH9J0o@~kgoD*XwvNpM!}9YNJpeAn#zg|Kig_mY)gc5JF{lndBknn|C9_p`$_!XNWv%-Le(nMf)#iT6Lle=su4rrRkY zTX$6z0o;`r@we)4Iq1V)eg<9cWl0W!Yn;0!(_8^|hkyek`;LMZ{ zi!5)GWPxIod)z#;;K(`L>S}UiY!xpzgKd$(dfq~LSff}fpn~s+(v-l%-*)sdr&3g~ zXx%nXxVt-4t!_zfLfhNrk<$Fu+hnmFy+sFxk#NP}KBpQl!B#?2&+Y>#Yjt~Df^{`) zaWbzmJRl0l7bHmwKf}cdl0}i4dtYP$01XydFlHPA#Uuo$eJ=H&{Z||mGWwz%vFfN4 zW)Z-#e14tQdAg#ft-XXOt_@jOR*JqEtM;I?=@J)(Vn=%pB*D0Yg3lz6s2XC8cvIoj zYvyQoheiDVczU-YX@Z?e(9c2O5FCOa2tqK}WbhY*X$DgaCK+r%5ClPR2o7N%>YIJ) zHtV*&WJZMiCdK4S=TTjh&y;njJHtFiGA>K@?oKYD+B~f6mDZ1Ok3m_iYr2(t(3?%f zCA6CpIazsHfvRxcsrR6s^yzrzAr|Z{{M3H#08uDx{(BU#DEtL7FF{)+e+V=x39_uQ zf&8Iv`{&RcZnD2ksJQCR)?5Wn-sJ)Y!1)7MqWqo$(NEtFh>5PTuH!!pBkZ)h%F;&4 zM0uxtYNrSuBQJ$o7yx8lA$(3Q@@9rfM@QqA}cnRTK zyNcwYo~*v;v?+#-$p2;`xlIKoI!FGk0!H)@4R+)t6b2(%%?JzVodiqe4VFL+6S z?-`mV7>HdZcyYK;)6bjUdDY;o5=E@qTA{LYem4sOqL97(Wp$b!e`>g^63v~Zc$SQy z)B?u=fzmuK?!&^WQ~CBvp92d^XSl1d3gFq-OVlw~@Xw8)QxL<#FN#!h_!itFfT;$e zS0M6}Fol&_LAAF!qAGnzoAo`6_wMQ`z*XxSe<3gl=3%OPrrxUDOWq8g@@R=#${P*168HicReb`{$ys8U!KT2V`QMW91zS1ZT5s-#|j?@5t{9(L2 zuw3ux`;<67DBNKo{r9jIa7EHE2huKy9r!>0>wHw5(~i{{?=Rk>>jT)pO^U6pH70%bR!Ix0hyYP zCZv0T__?3mc0ePAb!qxe9(l?@u*s@8~k71)sZX4wKvo*hQqe;!jbFfJXLtU+cxl4C zMDV44wiA#daOJYsD4hG*N{KN7Jr;Q1Y!m(PTS^Q@t-p}Naj|-J7G;(%zWV?Rxl95~ zp*S|t7FdbziGd7oT>jpw3=;JMyRKjN3|HqR1Gxc`wfzk-#q+csEu@k`_rN(~y}%q! zFWPAWAs|nGZsJ8+xa=)m%MX`4B9X{zU|7w?z8B=8!UcJX1fbiXQ6{M{ynu^k-iPnc zD35}=#hQ>1po&n{q3d8i<)1i0*E%fek*fzkKSj6Igsc3LE57 z)(=NDAYmA<;(5xStPo%!l!5}1EdK=T8Tuyu-REcEA{yNIF~8vgH!wDcd5lOWwTvHr z*-oC^S^7juU)({d0+I;|b$9veT=2Tyyg}#RtFlszb_+``vj{nc$gyNNCDqkw|5V{L=IOs%WrQVVI<7rvrqa8GU zshi}L>(p65B7}t~yf_*=v&t#a4(OeF&#H>w_X&+RAX!q@fsbcd6iMymM$xhfQkvI| zr%kWUZtawaTS`x0RKo^w_6C7I3<@XM&KgTlZM|AL?B&%wdA5jqj~RLvD)3z_*8;2Z z-sOckHwo+IFJk2Y;Qi1;0A648VU@mf_HxX){9913Zw2*B%QMxprm4vvL+5@I;#sca zNrP3znxK}g+K1BjGYA<7N#wqrWV`^n4XBg3Pt}aj08d{ zW_FE}iXo=r2p0^YPx>HCl-;J^XTSJox(&Cg%i6M(IlG%(Gx4SI%Ik(Hn%y1b0PFUV zzC!lEw<<_??(15ISgR{PSN;TyvD-j8H8;z}8bTnKaQjrpejh9C)?$A`i@!obtqX4|cw&hUPpg^vGg#r}cXXQjdad*UrtlV`d8VY> zM!Z!0{v<2y1&H1r4hYZ(w|-iAgx!KbXc?nNbqVUQLk3_XroB4&W3r9?zsZEpIM%j%eNsoimng} z!uQ+*x?hTRuZf>uvm}$>>&VvaaLcj);6Av$6`=SPlLGo%oZgPL#ml~36OBkC^=)*l z>~kY6mbH;4cyH%uTD3a3enMuz)`U>MKxM?hZ+mr{zXxh8)DBx**@Jz`6XaHUnK zv;Dz0y3~&q$rT@cgZ-oez?{!R`f+=%C)xZR(Y|BC zNDzp(H<#z6gx!BJ(yn!VBwKqu=$eyj1-Mp4h=ahHgq|+{gfA9LC6DrsSY}{*gin^2 z6%1L0sV-`RmG%-yU_mE%LcW?x@ZQsPC&3C*dbSh3$X>s^S=ofU_K3PIJ%E53ku6=m zi&g{S^`hpW?B#KmzJS9u=w;2J+Ew(Op?c#u4XY9$+~5!xV?U=&B{qwfU_l;Q))Q1H zc_CERT7O}eMWO5lKz&(N5=kl<}X{R5b)JySgAO@R!e;j>ZX4{A37Zk?6wI$5jfQdY%r zDxJ8nw3Qqsi4JxRZOOVP2Su~-vHs?iT^pgyvrOjgY-f)4IQ}p6zUUo2`6kZH`^f=1 zz=|MJ%NrB`zzj%%s8`y)={l_CAr4+f`*66Zoa{yRd!}Hcu`=gSArhhHp03~SiJ3p= zCgLxzVHLdzKvpoSg3G%prpm+q#{&u>m=H_kUY8)SAO#XK12jQCz9Dtx@4rb00L|f2 z+1EVp*PCa#^~krEr(jT(;rlVFDS)7^X0v{7h$I+D=I<<~vC+Z0)t$xK!0*^JfzcwE zSLFaZK*Yan)#X7dK9FGN^dCRGGJ`WvKd6{Nqv}5A+1OwP*8m+}UAGMiZ+DSBytneW zQUg=-;0c@p9uMPl2Ty6`UIOgZNBjYBOfgtJ%VigCY4}Dv+qvW)ws~M6juaiQqC+hs zhes7d6=LOsTPXbMP|S{PkMU0E!@<1IQpi3ZbleRdne?w-te^^@jNx;TNxj1>c9C)E zRzzjRK?PS3FTM;rHPw_kQN}l|6o9#oxonpf5qWhi_)Tbxq0~uh+{@y4Nt081t;li3 zm&g`C^5-Ru*jNq-5vkT@VIbXc;@te9i)CgAo;(~bQ)ZEols40x(%BipI(5Z=h7*Rk^bGH`?&Hwz5+eB(g)6;MBtq8(0oK|SC zUP1NkIZGF;LDZgd#s1^#=2qV0Vgl4e(Jc~uDaAjhppohA1rhS>y4{N{ho~%S8G&GZ z4?RM^&HVKDK-~Hzt&;sDW{OytXEaI~G&-3=Ex@>xdEN*2c#GhdMQ~*8$@>I~NpA5y zD>+*|&{a}-CL!y6kWCrdy1gMQ_&(gUJSDshKvA$OR^6Hu=(54$6#xz^TXbbAT*4wk ztwp{MzvN-A4{vLQfWU~6X_SU>SuVaEYEAYkTR`S;BRD-jIlcxYQkKm&Z_#~IfNY%M zvh%YhgY!mQMm?5Xn4hOLqPalN;|+SF^LJM9KgPm2$d5t%efUq&j!VQcF`fk6pV05H z7IF^?TM4l^Z}WaU<>!0!6|pRBK(lX|`hM1|ZM%jRnOaiIg%WXlP{FDKaOE}#OzB1K zS>&v7ob=XE*}i;+4$9j4^)EF>+{gv!O8H?X!fkFaC&;o$)gW5O6dYRUO zU;sA;ps@337(hd?vVGZ(@E#i5q_dy?W*e>Ik*H@ zN_?4L{kco*K{VO+HzppV<$&NGtl=OV0<-w)H*`Ewd^Lo50`$5Zsa;5g3gsqzkm`f6 z8e2%l$kGxOZfKJ#5RRENNaDM9b!~n6qL6-Ek+PXelL&4g5NQk>I7kMqTv4eNjY$Tp zpo+sj2^m5EVm7u*{a+DbrZ_K%Aj|bl=Hp!kK{DORxY_f4(OHxykk zlaBeaCO~b+_LG7+1G>YIq!YCGIdNnz`2z5>3$cr#U{d2tfrCvuyDxQhUIcv3?I~t9 zW~{~h4)zq@JWWQMg3iP2526IC*^d+5h0GGJcR?wkT^ZKrn2mTd+4B<1#EAh)XHrQ;_$jXI7yFUF-M83|mLD=7%7nyzq< zAprw`Ae^Zr&Dm_>NaogOK1iXzedG7Ad0HWY-(keCNKhvjrmJp{xG5smDxXvMyOmHC zZ-S#Y;*O<$s#KYSg0F8yI6K`S?(7U0`?$C8IXexwgiw9 zvyEl1R!QCCJ ziAc%zmbL6#mDL=)pXZ^`aRj55HNOKma(?tjJE|3bDq~F$Me(uo`76CGxJ`W@tYVqU zG{0vBIZ!MhF`HZFia_d0mVe!&Z4fqvaC%PyAWV*-0qXkup9}gSM_@Y+T!i)Wf%+Y%afJKY-1@n zXWUxdznxwP^{O{<4@9H#Q_eSMEEN9wn0vo3SyNkA>DZ8rJYRe^TbO2AI!}X!M9QN9V06gb;A6-}8MI2#6k^@2T~{n#{@faHmjcFKP)umXk^p zME5Ksz*{MkaGuf646J7Z5@2Ey+w{Go%AlpCYa+#G5%N8JyN06m71-2kxl+r@{*=2F z2u7_`}KJHd8M@usohKBEBVZdnXj4u<`+54N$3*7sv_ zm7zkWE-H}`ph9blCN`f& zi{DQH7eF9bLD@a{hyIef{;alLW=8u2i{m-JK^ueZGt>se^4tn)@PE(OeY{L@1ruOh zpKK%;Aql!BU@t5;0*q3iBPL0Oac3(2dsA1(Fwa}&6af5mwv?7(Y9s@C3r1-~Y-c~4 ztY{`@qx*nZ-8yf?-!NZm$yy>tt^Y2|05@T!_KaO5-m)qQmH*Dfq(%>Qi~b~mQTz=$F{LG2RhG%8_A?(vfw>9btW3~Q%rOZTyEvg^axceGP2#Rt9IYT+*3K*xs~ z&8%Eb_`n`z!-lIE0#&t2mFXuiCh6IbTMvTvFOorWw^B#-L7iVBbKVITt-X;&AU6yA zq4>I#Wt`P5nY%h#0%)%Ia)`z4iUEkVeqa$C1o*J)?pMu++{sW;Ht^c6y6%UvF{snOkx34dYvhNfx2-|tUTlxQ&`j&-ni4HSMY!<#Rmw%{$)%C?7 z=etM@v-oXeG>9oXk_Hdb^}o+?I#^+Xku0rS~?3Vs4yj+^t;!P_0l zDix4DR06wB0;+=yh)r&T^^;G3)=-y#p95~1$rCB$PPwTZRagh$g8cW>j4pb&Js$( zUc!anm(Q*6r+(G+bWDYHf0GgM@62<}xlsx~rHsf*G7JJ`uB@R#a89trox|UCjk@YV z8Y{%L%@x*Xkb{vcl|>Zz(9>8^)_@_MoL4-?Rr%j<>HFI*yw6y=MP?axun#VNRi5`p zRDSJRPP@KM3qR?D0gdCSoC^*MTn4Q0?W^8ruM|^f>W|M2fX_?g;MVVf-t+5B4V`)q z@Drfi_V33jv1kG=Bt&sR7v@Zxrk1vd-~Z&wDk`sB8I^%rB3o8+mYYu5jChrXL2zkd zio8{>Out>z3xqZ9dR1HZeqd?~QMf4%d%0|wOA`th%bLePh>&yc!TSXC|9y5`t=PWL zf=GHqD?h>5*qq-o0rs;i*?r5E+R&ip^X*%G`$xCW1X^tcNiR*!K4)E<8Y0*RwHo#Y zL|C^_U*dQ`)%)2!1~f|HqC;XSGhnca(bTQXv%On28rGDxf{$ICo`SLkue5zT4C2oV z>7u7YGa%jOs`x_$Sl>nAY8~Ez9-h6{CCsAP<@3~&mNt?0^22ADF zHV#Y-cL*>)i(b$h*VM0El4J>kjX4pC%V8n{g6m913Lm<29HJSsyhNBq20;#`&#h`~ z`vrc(Q)D`IGfgi^8u&8SM4@>I#36L}W3{L%6zX&kQdX|XK<@*1y;jjg z2xfJCLFmZag6O?6nfkhxGi1mtD{{e&sV!X%$hM^*1aTb}xB8m69&qUQk zzyV~4%`1 zt5`Hh`_&S}!D)gJ`)Wto17d57P=6-P(Su24HY}KmE5?r5*=ZKv z&8ezo0XN>lxauS&1cZ4faO!VRq8dV=wU=m&svHlk5wxN7L4L${9Ea_n!iMts>I74vX|gs^ zIMRw539B#uCN;%yL@>2r{4@4hmXVV^-bQvS(iGsD9!6VCy~X-Mz3yK}i&2+bH7Izi zZ=TCa0Q^KJULCuvn_dz7Z0e>)5(Jy)(dvm*K{q>z+05i}JuwQXI=iJR1?e%d2^FpZ$R1;pCc1q4P8ILFZZ z*Rj#%z;EMOpc>v9z+&R`g)&=0s;|PJn`-6@+zW33PVck)&3tHtq^Y8xBzLV0mY%2> zdl-b=#>GWVpZU`P*89*+1-MMlR9)^lKe0XJ8=TtqRWGM78jdeGhiUGu4JaOd7TY7P zoq}%@V5ZEzK^N-=8xYcY1X(;AP65n;8g=bcJIKZ%k)S$K*?;zLxE5D)u5a2$yvlAN&{sKLxRPH%UQpDkm%+ed z8GBN7MMbwCQq%5YNgP6gKPcgBpy z-d{g+fPiuJ3p9+>6}UWKHcV)!?J3;P)CT@sDz!qm=S7tBcLGIRF$ln00uft6y^Lbk zu(PRKB-ca*(KMz5D7Y*uMPQhy2c`nuzI8k@dehM@3tR9RZ1VA-QZR4-WGy(N+o4`m zkQ!yHb4y0V)K_?3fo2ew5)6)O30eEQA|$^9o2+crNGk`S954X`Jh==R^w7~Za+^Mr zUPg$v6o*`%OTgy>${s|wwTgS57ZHh{d_GSGJfnq+)}VA7+WN8h<8Uhh=P_J^V|ts{ z4_paPa$RMb zK`A&2YQu{FIL9a!72&8J2jD`nwgHT6?^W-P`J`iMbOD+J4 zOC)2Nf3VQga#1LAnq9Q<;i|3e7K9tpM<2EN3u%d<;mj*Mq)r%I1I3un< zHp6c_+5tfXKpNIY_Z2Sf3Afk+lnPGDC941v4R;%TJtVg3mrNP#k1%iTNj~JHt+0B$T+Fb zT~55eaJs6(8}AzfUzX4>`;Yq2sDR93A8W&1Pr}P&6A{G>al=k`_!b{gIAK0Z2W;N@ zLzuSqSJ&z%{Q>h{Qt-w8?Z#X#8kP6Ut%FSks8qJ|Ej{-`M+%Z}&a$%rp}+#d(7u?B z1p$dy0LmKsqW=|W1fkitpB%}b{wrox&;hY!Ml42|r?0jjXVW+saIgLu$b zV}Gi&S;CSq2ST%cJx-B{v+aca#g|Az0%tl(I=OC@Q3$VCk*0_B1)$ygaSR{w^Fs83 z`vZiG%m(Ach~~_`30Myl)YbulGr$F54K4W*(-v0wj=T!uwEor#c3h)!2**jiWa!|J z5TMzdM~kw3SxHWka!hc$-aVLh0wgB(_W!XGFH3CFmg zaG8z_uf%_CCtX`rYx8RfHgt$&D7WY8jvR!*SHh{fSd{hkJJL zSgRH4A`eIF18Zdcl-eJZ+Ysm0r2q!$D^7>oD|9Xy)C%cup>F{JKnof;swDM0yBjTrNLvp++$6xpe)Y@!q0p(h1BZycEZq#Y!Jm&@QabH zSVNcY`Qn4E%MLiG?{^3gurLDB;ok01fWUvBhj>}`_jMUR@eh%WLeACx;$1!qQ=8uU zy)EQ(;d7zhQKc>sDgctuMb%fFLWC_4FCNeHJ~n+Wnim9tIqprrg0RwdLYGa1J|^NV zy@y(A2-0PEzXK35ZYKhU_-TE=55FUoCh>M8gBMgl{Ae#+NPSNrL{}$H5W=cON`t~E z3VxoxiyRaf8$yI_tpRdt+nh$+HU)Zn{Unh7j%oP01#Sna{BC*!2AS{%{s&}T zYEvuJ0ugse)f&ZR&2xGFh~pKvfh{n+2vTd-?{-&0TNs-yGK5XW)Z4Yc?q=b# zrs4v^dOthVR1m!_Zu~@V;TT)Ds0-Gs!jCX4nHA+_t=j?wT**WY{MbUY0t2KB!y*lW z*`wRo2x#B|p#Z(c3*iYX4({YxB^&3lh zzH)07(p+-R=Ik#(2gXIi&!i@QFC&y!G`Y{_m!be@nwJ8Nw$Z!|m)#@QF5-`us4Z}G zPiUau2rS*4DSqqcNT=_~s`MpQuMm^qoZE;t0vz>PP&i0i)`dBE)mw>fr$gLx+vwby0t*|Y zR+fSNyXNOB%=2w`P^PuDa8*_dc8i3M^IBGU9Ums=rjLHT_x=T*JmtOYFa*g!9gT8Rm7ldyIvlEFQ5MldSV z;DF*l+j+mSf;pR_a+zKtaxHRp8}nJOh*kUxobx6d-}5R?V>2*Q8XSg3WgPr0v$S3<%-+yF2#!NCg4wgV)%qr25tk zj8xZgu%1#lk7h0>yD%Vy5zvH$dyyfBI7v&ngO(3URH)r^in}aO4BhT}G=%~tSqQ?` z>anma#mNIzYErCcBqlekQbm_b{sHURWrqaeN}8M6#k!K6d%azu3JSbq1rW4HM0SL6 z@}&yf5e^XVC5a(undFnEe-m*^;#^_Dc%Cfwxhd?1=d zW=azs&mrGtrt1Bxbl{fDwZbVrFr5%!O$;&MelOqzdbeFx?(20fbEuP(YP?+NO1Q#3_JWg{VWHj5J&9#=AtBrK?1cCg(nswc)uM*X;} z0x_tFhc4Gwjyl_%4F2qbrWgwkvoR8OD*#hoAyBfdTQrHmGqsWUiU?}OR7Uld z?vH(ngZ_mGwWxcPc`gW_#czmkjLv194ylf)^L!;s_H4?H#7q~$B(ylZ5rShsbh)fg2_78_) z$v9`a&ifMOev5mn@Ey{t>PZxb;EL$$YvO6-$B z?r)`s40S04+z7R$v9_1Nu|Z4+brm$Q+_p0H6NR5^=y_2C02)CxVc@=Jss5g&7KpTr zV)aJ(B{!Ju`}|d##&a#ka!+TQ$dCa6Mi*&LWj}VsX1^Ukmle1XEQ)-OOb)C-c7@DV zg!hjdGT0|3FRWt|5iETR=PGc6SGTlcXki61_~EEf%jCed=nK13p|S*VAGXxPb~ zx6-3MyA6|;#i^_+CVIeZ5mV%p0o6jg&xk9%$J6_~q8Y`&$y8nRqThpIPhXUy=j|kj zd)>?acA|tt_QB*o|D)~QbSj7}44MYZt0jeK7}SgE>G>HI0<+D$f96ZkEW;$|^rGOO z8mkZip{e27dV3)Tw_T0BpM@io12Daqs!P`PWe+0!AUk+&rPhVXL)b96>{|hXkGTv- zOZ{H96kX)J&+S{(1`5GZK-s15aYci^#QOnP=`Fi%{Q?xa>al|7R(N?3Vro$)2UNCF z(aN-_m;BAtz;tR@i_o>MHWLY}+#*+%#AmiZX4J9ZLg<>5e~QGS_FQzF_74_wlOBag zkz>)p6Z9pL_vjWF2^;Jg;unmS;1EdAF$zn#xp+o%#tkC^5I1eIza&-+HDHRjxM%GtW_?;Fj z22*CBY;R97`3DCn49jQ#D1- zq@#P@<~>M$r|Q;Ht|iVZIs`ZklUsF#KC`JnB?_dQ5=}Zem;(8eFGeUDZ=(Qq zo-k{6DE*}eWuXUtcuS1H{x+mQ@KU>!Z)o+|9TXg}ryzoQcW{H_Q~CXJd;TgIO>JRr zSi#Lc;bwp_$h(pOsuL7@Fm5+f4viAf#goYVKI1V3$q5*tjv7Wlbv$I^_;710#EM0Q zpU>-6W%q7QT{dPbSSun}4C!cg@6IpZx8M@yKG;`-frCzGycF6wG76( zd8IT8IAHbhDkNGetP}*)PWjH#t@W-zuo{NgLk>R*vQt&q0;T))rfLS6fO>$fh*yPM zNh+*F!R^(b$MG-1z$iiWq?sMM|5PY#oXD-bq_^TlHJ8|;;K0W(U*olY0h!b~3eMQt-|+H#Y=Y8;%&@^ z7rz1J09uo>QIe|aCrQOkvV1F_XmKgMag}6JG`Ggv8!kXw${*PghhOchci1(ECso@7 zVs09{KCl@Sjy0EEM>PUxF*iHZA!eBN7uTd_lRiK~2wIpN3fD3yWC58B43y>p$dL&K zcNqj<_D*k7+Lw1SBCBG*()$%&n||ZI#vlhk#PVrBeETDOg*l-@s9nEXlvpVONX@z} zWjK(L^|W?G+?rFi7oIOYl{x}A!@mSn+KVSN^*gV&8A6Eoz~`nyx@exlB3=oa$HC8} zCj#b-_P92ct4<%CF<#TU1gHVMW@(m?x58?$yC~c9`O&a;E%)2&S=`iY%YAx>J6~Mv zX!Ckb2++qzRGKj&6m7`prRRE1c6p00fu<9_Ypn5K3cv?=k0zaP<}l|d;Vga_T%eQX z1p1SGpLo&*)*epKW$!y!$V0mc9M&QJ#Q0YI?LR;;rHGo(rZ%~$XeqD4JjS*22;X!m z!z2MxTS!AQ3Je<@&Um!x+qOh_@X}~$CFvs;jQAm?^FyU;GbZkO0uwsH2pz-QNfzaL z*4baB+l;c5->&Y7CUc z5(B|CBqjlpTg+I}ubex-u}861fm5jRQJ9^Ch_F0cDdv^Y-SEnY4IbX7PdnY`Pm3wT zS#PFh$GIIT*`=3#0=lu-LL-`J^=YW0J#cWIK*RV5;B95 zlpUSV0wS_3TCiY1B`M}}8}gcefqJHN;?!I>nl7x34_^x!FBjV`+FYghLLSq0ZPk>q z$C#%(Ij4zIsV%O5@*$m6cEtgDWLR0tFq@P^i`rPzI9rrk5_1vud+;No`$PAg6WF8`c&t zaNhGitH@>Y=CLg+C!YQ_JTwz+iU{{tq`b9)Zr38HU9z*EKZxM;t46{pBml&4i`#?! zZJ#+G-90>pr=@|L#Ijw9`jQdugpT8Nm6M^nA z3O)&AHur4>hQNhF?_+GczU&ndlwF&u0SH76e*$3uT3bKWHGzAl+t{2eS+`d!7}WN* z_Y5H#up4IOo^c~2k$DyXwiC)xdRH}}_5irerRLw|4bb+|!f3yj8jGfqoak+OWrA~f zdp?^i*{RJEhbzPA|8`d&;QD8uE$Fi}&f@SkSyL~-vv6LRvnB+t33{`_x+uT!X3nRNv+@t;n&|!zGVla%%y1MZ zrR9mDL+DqWo5hhI2~d0E?9{G2`XN2utn$KR4BnrvM-NPjrGASRG3`!+z(t0&N*4oz zMO_2ur*pKkT0Qd4FDSqWH$%KAaHqaSU7P-EFJWdwvISH-6-s~-0!v&>ZzOIX(k1H> z>r@M1f?4eHcD6rQx`-pSXLC?PE^Mz8i^KR1?hev?;`#odT}SltHM;b@WO-avC0@Y# zW?kj`bHCrk(CLp0+@$^*J5Z#^TCdYe#mOSoa(s_I>rrhsqe+~^;1(gBxKX)P2GUS& z_5?@elZ!(hRyCQ!!#GZwjA~7v7wEz&rx>`g1QUk71{V?o-KfArB+H%8eF%*K7|1eh z`B~x-M4LiRn%BV(5-Gweo2issA;&i{wEDnIokXm`Ry#hl8^qQm#1N?;O$z3DAOzSw z(@y0i>@Tre+esHrFcxO-qHc+!W-X^X68{40IQ2VO*VLjL1UkcgJZuz+?@A5M-i@&T z`CoO*)F#z0APxO&!Cdsl?HnwUayfttp zb%J?rk=846o>jK+E7ghnK37RPLVmp|>5QU9Spm(|twk?U_rc4aG8K~=pti1L*S}u7 z)LZoy!EGS5+4{W=Sm?0^Pol{|n5E0TpJD=1t!6dJz_2!B`D5{NKDZE9Vy@n>#){#%={2`YmJ zA*+(?WNRjA0oN2$2G}Rq)y`s_d6{~1D4rn`s@F;FLf0%x*s7C6a~G;ja5s_Q)obnm zNN(bsOZdj%-2IwPn%e{u6>!mYxLydIwNRkPv#u2vS}t?M!P;9?@CI21xagmXe^@~D z`Wy)cFE1v{osjbbr<&$B-l4-%TJPRN))fi@m82{Ua#cUZhDpf2emCYg24N z0RRW;GTJy}zteR_6LC#px*S4SxfIlf4ZOto6-9=S6)o>`C3`lOwJFvx+z(-*ZqdvwM)2{cDjron`Xizl)QZ6$_=2f~`a-w6^&7^4#>^ z%!{CSNhjJq_jCq}Q`v8&bXnn3+yD^%Nc!t0xfaRN--UC;+HF$dsq|PH9~0K# z%V9{`djagtM$>6Gd2$O_u0qA~9_I-~-FiboQI!`eG7k}+f^CJHJra0CSy9m@Ftv9d z?&3N7=7}oS$7XYFV~d1=LKt&ODCsQdV6D<$ErIbP#WYjK1=gjV^w^C8_M0(LKw-4D zP(bes-4nFpO663tTn4|f8?G6)YhJ^~M`F#on$ghR(WwPNy4jyh09%utQm1Ul3 z8{w625&pzQxt`rlbEFE8l#GjmRZA-G>h_dHhns`#?aedMKxD#%To#yD&-$!Wzh)S1P%vwIV8PV;%)clXP4OPP zCe3Y|({yE(?bH;!&(hNTa=M%Gb+T?N@s4KwBvk%rn@(W^dtG=xv*B|RjV9agAJ>_A+!87w6lPT=E z+=8?Vg>)+1ist-1^LQ)sWKgur3ZNjzNK#&zXX#!}yO&!|PX{%!aND%X`?|spBNRJQj>nqS4WovrC zkeg?i-aWPME6}xcO+nw!a_0%$wQ(6WXU=HKXZ`p0?2@RdY16bc=aMp~jrULQ4zk1Db(X(` zo&vjX{eL|DvDGZW4l{`1Jx4Aw5Hb)$pdnM|YwCs0f4* zpbTWpT_teueBE6|5CFknlbqvQP@xA|*I{o%{AooUhal|KCSjr*!qYUrNsKhGQIuZb zvlDN^QzP1O9;p3AAsPH8-#0_5nC1S;Wl~zKixA%`Y*HBRgk4P*Fs3KTVjvdG5*SId z*@OIHUgc@6ZmL=amX1sxP|sGnt5~R1Sgvk=wXEF;rcXpPXUp-^s#wyu%7|~P&;nef zPDJ^Ywn~KuB(_Rf4oQQ>`Vle{Ln+(k?H37lbY`_Id&J-CnYLV!6;$gg=M%9NuWL3d|$y65Hj< zO%_CV8)}PQp4=dnKq(?6;`w9YMV-b8EN6)=)Ql}KK(;czTuZD2-aWASzzydym@uXKP7E5PM z+(SNlHAu3{qPj#m69)50bgCR^pb*>8oZt8t;_ZRPsDNTY3@#0at;76%+z}i zvv=^meR=z%d!yU)RglM{;*~)huX}i5GDmZ z24~%3Vo=M62l`RR0p#~fU#*gh##jHg)NFjHx`drUbewlj8nKS=clDgvOR`pVk=MaS zKDvo3T$h(S46~be)(+2JDwRWpK_xgz*yo^_=P z!2G6(l7K1dX$yA&6_J>Wnj%LRE|k>BceaT&8z;20$|YAJakcX@dxZFX2S>%^p8CKC zx=21a2s^?@1g)3YXLb|4RZ=b5F;l}2MU-9HaUtA;R%yNqe_+yFl7*v99bZ_C4l3pL z&Z7jHk2Hk51KG(CoSpf4dxK>m-Vv+S{1N}~6tM(9;btWyV6A&=aC~r_zmVw%X+F3* z?k#dPuF^#q;Y(~|al=V7p?9b85SdMv7PD;@NXF(F%l2ARbu6Mlv{nmlDh%0pP+Wb2 z{P2b_c(kn@Vz;tp^^WPf{?Grq3M*D%Fm(%a61qVzNJ6(=c!j@EG~Q-F zmloT|`*Ad%w?JJHE6NEID8H&OI4?85%1aB$$*Y3nsUZ9L@8e2b2y|!M_ zQkJ#lpWbg5E&dg`jXCC&wzEP)L?#f0w||o`pXV|4vn(nwirVNZ<-+f-vXIXhX!GwR z1^a0KeZOJzEuTxcr|q)#@I9kVgJvy~=Phnc4iHme3|W;q1O9|XD0Hp0hgrD67-#C9 zo3mS>G~y;`n~Z=h5PCZUK9{;Ghggveg81iuc3a4v5wT2d!HYbhme}HgJ<7%5-}R%kKX0+#vbf~_7GyZGEK;N2#8(K1o%;!23V|!3 z00hKR(<=@A-=Uc`nIV}x@FNGBHJW?ubHPL=}t!3so-ga%m^>7YE##sY__ht@f?_ZKH_I6Jp z_L&4o)QeuDcY?3;DipR}4t+rypES10pNikJ*8T{n*Y1jcs>qs6v3sRJ`bEhn^B9jm zEW)GZBCZmIr-B8Tg(3;}NZ|g=2o*cKt^x+YLK!5grT0Nc`~!?<Q@8&>WL@rwv@xH0p%*muS^sT za(i)hDQx8o1%gy1$i-AtlY>8vV;eXN6uJJ|DE7t#4MAA#h3LDbdK^7>S;PY3$MfV9 z)zMQdC=;_M z-&7%PfrDVvVnURs3un)+MHoMvH%^^mw$Kx;{<46eI`TampQE6PQm|GcgnpKBga7$o za=w2b2|8~8{NwmS4Ub_a=iC3Jfdu@ADgjm2XuFJx!9+|Dp1U|QJ;nMZF1mA{LFAfV zXep$!9Xhqz8l0&pqpflx@S?;C2{Tl!lg2)VzRbr1l3AWbG|%wjGo9%mpa9Ff!jG7% zZ~4qw0aFOH6`a3z2OE-_^qwr2eUG;Rdp?++18;Lv(f-CY$RGwGr}TxD`>psUP639l2;|CMh@#$(X};m zpw(?lZ_{mV#1(a)T`_J4#E;=brdHzQu~t`p))g?=E?c*eJ>Ng444|-KZAPpFRhYdj*SseT-wGs=6rb1|0`YTtxwY{}r2-H2(HdOELj*1~*&u(+i`(Om| zZRDpD=3F5`4&*KOY!B3L={6na@qMlkD6rP~jsmQJMGwUCWsz@jeL~@uq2n!=dyrecnzeQc~ z9#~0wd-St`M>2k`ae#dSSeiZA=55Ac?{<(x=|Nbuq1F397KZw-^U3W7-@lcr0+a*U zbvz5B@45Iuj#~-RZHc{OsEX&ZW{v=5$o`v6DRy$m^$@fB{TU&4BZyRZkt7~-Idlj? z`?HKqlvy4uPz8wLPFQ61y}5NXQi_nyaD`{7uAop<0TPzulo~Eny?@kbWBk_hRB!Ya zxCX=c@(gXfUBE)Gp?;WvKpF-X;Kfk?9LXgvxYSa2hG_0`gr{M#Gj#M_mbU^)F#Dh! zip5-!>F1JV4!0r61f$QG4POvNF`XfI8Eo{N2L9S`Zo#E}j-kIyHDQi$_CAchn8;yr z1D95atX`R|z}<~p>E-?!f-qjtnsxZZY@hrli}yRP=9Nlxd3(L$0IRhuJ&P~38y;zB0%mfm4rw}hX z#h@#ifymdnAjyTf>{|L}NV(|4T3hYSNzM$CAe*GR421_A$vryQV#SUMpRyYy7_M?K z1Rj$iwTy2GS^VKq+?_GxZ<3u;OxRX#h0GyeKwLYJL!2CK^i1tA5RB#OEk9D3-rpt0 zdX^Tw!{o{3SP^2?hDtOh#Y-`&$DBY#{8#<7I80UWn{x~;v~s6A6>F=$+7jpXL+!VL zT^HFiyyv`>0Y5Gv-vgA8UY3H|E$*j_eTyt(u|Ul!FTyBEgHONqf}6!HuC-^W%Q3dp zraIuAdmz-%PiSMH3b8x8P{kBXX*aeBc|w<|TR0Mx)%iY}p9oV~F|5+#Ikx<)FB8gH zm-D0&ZkBb#E;RO(v%WAH200hirF`+Jl`#p@W8S{Ha<*;lj7Er4M%nnQZ#-ivHT(;8 zqH^8&&VBl3RXsCZJbo+tC={LbVi$FBnx>$=NS@O2S8lysaol}Evux={m2*tq_F;Cog7UVw)(uc3b;kZVA`J0(Wn3ghFN!UcG(-5kKBlm1PCnSq6m6{meE@?N9F~YPIsBc!kFH&{db^?WNCqxHS>$ zk9saZdZXv95X{$uoi3Ki~xlfZMf3EjdCUP=>|nEhqb63ox~= z)Z~P3|0p;Rz_KpWt|^y)Z_`MbNot$p{+Z25qplc$==r@pM)6}a@Z%-RNr3*xdk6LR zIQV>j?Xun{7EQPoO8qELzlDSEv&5*#0TV?D&3wN3r-m?wNW@rG&h3qBdY*gkkA8gK zXOaTft-vAQ2LRvW`2#N0Z)f{I|H~AR%6*(OfX+>z+$M^5p7n8_U1^y;9Hf|4ycMLjnC)q!M!cmJ z&KH!y5OWOl+i}4FWq@{t1Y@Ny%DqZ?P3%K&K`$GbvTA>X7!A~xEf^_0AJp&!KhUtB z7yQ{hSh>gPm4k8=nhJnX4RaP1(D*r8b^ns2oPBa14|>U7DK$JN|Pv{dSFj-62rW{oLbveMw(j2G1 z1)52rrtsS9BpblsAaaoDFQ1Y zE?ss;ves0Z9{%7#dVZ)y0JQz`8AoYk`THdq)CSLT&HxD{zC~L0W6NMUK9}SkBh+me z{XGVZV(NP`=jIty`VvHpv!UuMEq-KRRQ^1WZVf}afH~$b&7ulJ_za-PKEXoRM(B_^ zQwu|3YH%*kgnEyYGwcmWxZZ9rx6ONe%vS<03xN;@+I@y2;@KZrCx(jyi@9ZK6!&O5 z)Ae?Wj`y1}QB8?M_s|VqAqC*?yUo9ox@4GCdgmlCw_*LLtL9ZW4P*<9 zMZ`?n2I#-oU~1e*ZNH^Um2W|TE4Dxc*ypmsT4r7IH~k*CF6sIi!OCZr)K)Yi%%mnQ z8;t&6{}&KMFvXwU&L?2pL&9liD{W{kVLsPv3wWPkDcaSJdtfOhWRgqag0#3ynOfF* z&m1QDHhQ5<^_W3HuuE@m!ALYUEhTb%-p)1<*UsPn^Z(S@PQbXx6~E6kk=s8;4~{zl zO}PRSNPC_OR-mb2RI71=nRah2U9PM^4hp{C4IuM=--9fbIa7fDov_JS0q53Vi3p95 zTGe0+GB`zRsq{;N#^+A!_CjzWbo>mrLyKmA2?Bk6rhQ(*WNM*#HjI!|4B*#_z#xE- zKYaA8lr5u%5tQFN7t%G;tDZ`@(v1|#O#22W(_BUUQ90i?Bm#qVi$A8Y2Q)Q(`RsIb zS;_vm(m?(%mS8pZ23U?^KS5m#pQgP}o%t~>8=YLzjSe~oJ){sNj%4_SA4cKfN}}u7 z#t+FL?=^#3F0&VK9;}SQX|n#D-)e9|=v6aVg&EN(Rl6gGqD$A@ei%VoCC9068B6En zYTxDxt6f;VfKtqj7@+ojf#r9`wQcL#tYTOT|AdxWVt*3^n)j3!*k1zEtBQ9J!Z_%k z3nE*ij$vG7=)KdHQ?OZ4fVPAn`^AFF-<>%-TCEi{Adufk9AymJEZWp=5LEt$yxxV1 zW;O)GvdW!VWvNdALQ)2}^^}p>_NwR$U^Dzu7V)$qIN^qBW+fqlrMsuVgq;V>*eqN$ zx;dLY@snCZ(|zhgPXIy&SwPe|C;#^{dKXwUR@(bf?JN|r6H|2vrK^YyiwbsYOT6J9 ziVEq$Q8owM+}g98wL zBlMet0Ih7TGa21SB8C&=t;m#FKK4(11&2{D1!a|NN-RElDy?%sifOZIz?BwWNMP)j zHDVoGIb4O=s$XK8T}HO_#D%w9b|I=i*%Ps|i?cQcfjFcQVg~jK1|K`CO(y^OU-o-x zm}Ngf7J&7029=?u;pS@C+0tbXG(2yovX#td>22cIFUyGbaGlQfJzIf$<-QbPhWQzn z+|!u@^FA(*>WVJ+BJ{aub*mjqHPQHl;Dyc~`nEy*ES2I9?>zT2aW4&8AyZg#9*q@+ zkzGsFg|O{VFrlmJzer1n4vmxLGv}Y`Bvi z*DcURYoL{h@LXDyr?WtF&{&htOfv`_C1n@>&HY9ST!zm~Gy1--0IiEtG)?}D)&v0D z-g4Wjw~#O*?cW2H?Kbl4``Oe!m%Y;qv>BPk=+yVDS)-a~R#5FdD{E0)h9ok-u!z#K zzb?VeshC!xAT>D~F%Xkl;O9$7Dq&qe;qJ4MpFaYj5HPG)DC|C=Pc~Zsw@hsr_p@dj z?AHZZnp6-iYDvRFaSvW7bSiSx{yqr-UV(waAA#*4o$N8Ak=?{cQtG z4dBxIEF88IZE5!tBfn3@7SZ?L*C-R&$6x}PRNkCr^M3x@eYFdotunq$!AF$E81L7h zh^w5y@%)kqs44r+zTkP0=Fbi0AOWW5wPJC#+_`1EdVv8LLOus(ZPR~C@YxI-w?GCr zz+gBP@Opdi--|`j;Y*xB-IfrzcDk}bW3XO`E8>^00gWZ}_ZwR4yORVPOW=o4(-K@< zmlzQ@7NUEh3Rl&@t!%dVi%Ly`{OB%fFKb+%riNAeoFKmh->*cNxQ%V`f~(s=nh5*C zn95d)RfCQfRm3)8VpYHYV-eHZmcT?@ynI?w5A*|I`(#53&DqTWKL-+PSGAw2kOI8t zS6!VVQTpOZVReNsfF@GKrKtz|a#!3x{MIKiyT`4nayd^Q8ula7EN%76hGOiaEOzm>}=5>($-RkiTDY@VW_Gc;$y+9dlWheMTB88FKaWM>W6FK;on zwsB_b@u3y{Gl%)FFJ=7#I*PDLI54QjY`G$U&oMS8S<3EgN;d`F#NYJ69ynE^(quDN z)#;HnAlWGsNcxLC-2VUG8!)1AG(7=VE`dciqLd5lp-5T(|Bv(3Q3ffl~VtHby0;> ziLM320xWI`z;V~(V$S~o?2w!C$(!}8ME!OAodLX}2~4=_pTYOa2XRNwPA#^E%Kb~2 z+?9Qg-AH7Kw97TTn0-<2NdNvpigqU-mdQlO{qp^zLT9TL6foS~YAUQklTq*vsi~&L zX3A;Q8cAMq$=PZMiv9H}FA)!BZkT%~H^51)&B-onlVh%2SA$H=8xG?|%|i9LSPqh` z<)Q@E#Jq!W_V{c<&sSmu#R&ozx&G}K1N*EB(?H|dE`j+zRWUyyyNu*rsTBZytZgX5`f*!Q%Rgz``y{dA! z3<`;l-)E%&Tbvczowv$q-v$WWzYUUJkb7dTQqJnxw`5iBvt&%VZbux!Ac{--S+lIa zCs1O@DJ@U{w(S3&B?_i{gyU8-326t~RR|ZlA@p9meDsnae^>R@fp*DK`Tn8#+SmZ} z7U&neSTpS+P>fiRnOyGzyihS5t~C45u(Jtt;S4x!(N&Eid+A;qo#r0whS#~0!rh0T zM|j=iQK__AuntKF`4)m09aj(GiV6DGHmBG+>Bq(xS#WDN3il|LH@euin9aQ%mi3=x?UKK zi?<%YC~UGQL-y5FAf*yiX05RUW%iWY6Mh?Z6r~^TR{`aG*HG`EzJ8Aq@esf6zgfHi?9_X4gZ}FZH}@$!Hc!%SUf*Ks+B}jM+NxCbf}>Kb(?l(| znOh)|J#TMyo8@*8R1IW`^2 zHcy?D(wTQ>;HqpepD+WR1y_hydxEf3e+KHbeSWt!w3$b)b~tb1;kr~pFerQ_7OP4Z z^o6Gp`piKE1PfnksUW=q!!PI6I{f9ViO(^LzJ=}q@8aW~Gsu^amW9Cg_Xw+T9A@Z_ zZ~mwX>Bn4eFpYNx40c%kQ%t$?t;64clKM@G_*U;{~P-=g(H7@I|0 zs||KQB>s2wl>&}H^sVg#Kn3^I4gA+C4FZ)m(Bu|cP^Da5bG7<8+5ITVMy|V?GtK~^ z$lnCf29N3`DBvVL64;Fp1VBRBdIl+EQ+%0_#ws=98!&Tp0?Fxn0%S5jXBo5q?ar1E zgVrq|GvL7t<~irm&#g0_yv;C^^EG(F+0uJ{U?S^3m1Y1w>apa%=2tbeH5kaLh(?>I zRv3Qw9tIUay+~~fZq@S0zmkl1bnPh(4SKusMX~-R>;m3ff1&H{2+i>C53>iJx(pil^>;cyn!?A$?Q zVI)=tVHjTm2PV+TJa>?EIItDQYwkWRN z4{?l4IHtf^RQkh=#S6=G`{Z31SMpK5opvK>B{Omf=83J()>Utsi>`O32%If4H#y7; zl+dntHXAFVCysKa)`G|)fLY8~(ry^ddZYJbm*>Es3qm$sV~`SRmw8l140N4Bwym_B z;Er&Ms2Di!9to;Zkl7%I?bkUzK|t$M`u+e5xzz$;UExA0C6#8L{~}gAG4mwZH$S82 zQ+_TiKGT>7RRw2Uihfn0b{97TB_$Z^l2*tTznc%9v-dY4N_t9$tJlfoqzctNU~kqt zLbAzr7F^?-h;a>{XR{5^NnN1EsOe{Km4vza%+|h%@q@ zR`FeVE|%dCdjOXMD&*(L>YPKtkyELEic_hq!SLhV1J}x@0&0O8U1-%!)WQ~UuR~B3 zY;5hDFzA)`s1Z(qMVQr=d$ZYl6W;Q}NK00V*^gvf+Jfm*c|x^yk2}fuhA{6u{oxCH zzAa2kbNEli z7%=26!y@ZL{B`ixMoNYQNXhK-xcl2-gt`j%W z!HZfzlbIa~m8Z867Qv+svt_5RzmP2I`x`O%2(?iG51qK7V*85~8;RGBpo$~c;@Iz} zz1R!$5D*O*5%cXb9-#Q*+kNp-OwZJ?=9H?(AS$8KkH_l z;vg#5&w8uhQ{2jW6|wlnH205cnB^K>?ji=PGisojQ077m<6G5%1bw=`=H!a z;O~mLXz*}Ih&ck73ND;aO0gDt+!B9+oIdiBHP{JDC$sP407#SLpN_Daw~p&xPI>^t zL{8u`#$m456S_T)AFe2TN}AFs*BMF9c_fvc{V7F%GE#h%>K@ebBBaU%%7yiUgdff= z`mnun4BkRa#zQWt-7j$N0ib$%gI*F<8v+{r6D-4mlc1n4AwSk4YF#zEZj*VA!>MXa zr7Q*MFBL9ofFKW!E0GF^^YBE*I;1Men&ugpuXqi|Z-&nsiWfxG!RW1H5Wy5$UZ`?= zvJp`UrfiSLnAi&x-ZL9%>mj(?7rRQda0#!_|E&#-;T>_nTXvO1In!kE7pH(FM8aEP z0I(5XM_cPfPpeK*0TSFyuk_>YvqBxp1pW5j0##hp8jts{%ClW)noba}2g<&+NG3~d zbMI`2XK;(t1gq*VFPObx;Y8`pk}l~A@*QT;@yyy0S z-g2XcaS~%vy$sSUIfr0*ezC}b3gsl;l7pI60%r+ZQ4e9mx{a-9%to(_p1(K_V9Qn+ zMx;uBq_djLAxFhw1eBQ4tZ^Ew96h#l z9xQ-S7AF5{nxbr?cx%p(wa$O7g39%j9a1A~0m_p$YJV-0WM%d>LBlY{b2)>By?)msGu|-YrcE-`fZE9 zk@Buh#Eaw+FCadviB~hufHiZHGmo{M%bkm+4eHmccEBU4HC8RQ5OUX;Lx+XVQSKSA zCNCi(ec8M$OVulzEV+Cn*A5g4u7mVOn8>2CrTRkI9d5@5Wq}f_3p4fOIT-GDeC5Xc zcy4Ar&_Ha%qWdP(D8ALy0*i94p_@mQ48!)OYUEq}vQOg_?4w=&Q=eiu z2#G}h`Ja83oo_Schhn#-TDzqzGkpPz!=Q!J#VmP1Mf8TFGmsVlvVtIRjo-hSq ziZU1kAaPfAK(Xwe%8 zC_75y_Iw702n+%Ca8xF>2_a7-kby|>o<$(%*Q-V~M9m4Z(n7SAnq$aTLIP#o((0aX zgXixIh~7vqM5nn{87s(?Ys%?SwgmfG4Upz;i7y34S7C5&POfT2lgpX|ZxPF&VH@b+ z9!7=>BSYZisrL-6j!nq2GOC0LR!F`7Z+2xWn6_&^i=2#` zzVEDH!nGa#NNl4Jf|2zt9|SHWwEI0KDI;M7qrKv9ud`7`=aQqE@a!4);^bRDR+(W4 z)L1Uj^;g=)RDT;w)KII*gEdKQP+L|Kpm3i=KLfP6nlyiBrfz;RUCvkmsTHsx^h;G_ z0bA2Q6N3n*$8ZVD)IC>_fioK901lGobl?!1$7NXqsz%^2+q7a74C22m6Pv?eRj$F0 zC`!$_TSXKj0Iezd20VM?LG)3*X{;(z?G6VQ^Ad}T1symZTrv3#+w3+|D7@~w%Dj9t z$3x_}SBTY+x-xc1OV4Bsv@OVdE6n1Q8g~7;hTKyaxS$wi3}URTf>;Y7;KL#>C-;Dn z_JcSRZv2`jUM+>4u~XFvE1hoEJ%4gRD!!{%8}Bthv3w?t~5LsF=o{E=nQ;AH2ryi-Jv4N*5dKNlHb6$=l~$yN|( zF1T4w?XFy^TfvYL;_wi`y}_+;c9uMN`vvs~3%gQ`*`Tra_LSEEUdkD{{!X ztrwNl=6QjtWMn+Mgv<(VC(pbOeS4p{JY8SOqzsw21S|DE8)&H7$S#D@6ixhIJVII* z&xU|I!T%xRYfTaBHmHFxoe&CwoayC#U}!}r{A^aE0E9A65^5_`Tl*W9w}@}$xNjiZ zo%j4}p20a=FCp^&?qwl>Y;oG$bETj#U`*oqmXV6G9gDQV!dvca4{P%Gf(TMmh`7Be zL^jA!18#94a>Q73tlx^3k<&z6fd7lR4vmcwC%9%2;i{@;S5@Q&UoLshj=%z!uHN?b zN1?&Ovd|x6ZXdWrG0W`CxRk=6FWuNHsdg?GOY>s?w_+Exi}8MT^PW@k4#YS75x4^N z+$u2B4~cJ456s6Y@Y!r>xm{j#y&f8tKydBaf@RRN?3Gc!xFWqMuad2SOGSPtI<=0P z+H7MpDflSDa7|9v3h#wrPJTrvRTUD@pcRIncX8SH|I090KI&)U0URuq9GH~CVb2WTy10=pEUgG)%+aXphlaB{*o z+2Z?*tA*W?RQDb@v%>?Sw#Cwllgm@7Qqnnvpy4oU|MaGZu&!>99FDb*01Y~7aqTKT zA536icM0>lE5H0pS&z1x5orW!xne9`5oHguKR#Uxq>5fWcY@i3xD|w1=E>+a( z|5pwZrd>C0Em_sfiscCZ`QJAc-a-Hca6Z?3)~aurfMC6UZ-LsrAu2K9-jcU+pLJ^r z+*+FS=PM;Ner;aS&Y`CR#5F3H$rU$$7aD=Lpqr8bq`!}lTcMUvU}7F4n0D1=s@9b~ zYl`%gdodqGu{#zrkHIXS&AB*NOe!na2#h-*<~Xocpd5d8E9nu)S5%;7ru$9bXEdyR z46LRCC}Oxj=>s(AzNEMcwuF0J(^~rd;QG&85t|Swp~V*ZAdQM1p^ zv#gk*3p8U5Yo1LMnjE$3tq?SqsD4BAf`nHPmrTVj#O$f}`O(%HovVGqTMH23PgRFx zuA)Hu^SR5eWg#onR2F0yXnSRV(v{P>q{wZrrM{>YZ&?*A^SlWL&W+$-lW317cL)_NXT1*P2avPfG_P_Q_->#SZaRimCv&iRZL|a=l z%Cw&esrMO;OtjtJV>!Q=V$N}NPqxlmfc<$<%erR)V5s?9gedJEZ`o}~xV_xxZq=TD zU#r%Q?FGpEvWgZJMc#aF`-l4W2g{gehAd3-Z0UkqFxteN(eSy2?EpV|$j82D&NJeD zM$mH9@7bJfYXsNkKmUU?bU)a1vITl;O7oc?X$54OnEhN40Ln|PR?zS4?eGd) z)J3_6MghxXz!7{#33><;NiDSs4b5@p@fqsd;TC@8U_4jp%jZJw z4I0zf^CfElW%=7kvMjzof|-PCsklJ?ID<7h|LC4Tt~n!`kPQL-b*JuQL#*$C%^%VX zvfPh#$y&N>_WOhy*MdZ9pZmjg3=9gZRbRH}Y{=gFU8{4z-QHLdg1Tk6fGNQ>DAbRi zXY!dPtshIN5e=oHq1)SXZ7|Bmi-60|gqj|cfOo1MLFCy+#&Q< zetyDUnQ?8ekSOJO9Q37(vhK<6Zt-UrGj00_bhPeREA}f}q7^fUORK0}#b*KC>IL+k zId$U#1Fl$aZ~xAPmXvR6sid!K`&Otajg;tWAJ-*Pq3$-K)W{uEqxo#FU3&~-&!#ab zzLG15I80=qlK8WP(@P?pR<2L9+GnXj=~fuJzfE1D!YN}4XAObohK!;qZlv(X4VII$ zG#IxwgbeOk3tAMGP%Hws$pN>b#Of8_>bF;Vo7*+w#!pOT${{lx*+n&}%t-^8$pRKQuGQ-s6{_S+l@9AmI z6(_AFrU}Q7V=+*;<_{mF--rGsyJkMaEJKye+RNU}?H&(m~Eo z$so4m!Y1v;{i&BBtp&;O)(`aosUb{q-Zh_V@l8qyY1Z>is$1%7{eD$5l^ z$os)PbKtSk_wV*HQ`e=pz`-p`<|%Yq!50ZnqLOc!p^vK9ry(p#Z|xWL<-zn7r3 z5uM^PiYpa8u4|9)>_p0BtCmKT769v>Ktg}PDz@g-99FJZ5vad8Q5;lH7BRuq^dcYbr!;oadR zv)INB%WFl;{g&2r&{xDIgj~8pA)hHET|0tyQl-fzuMv0(sV|Y;Fa#+!FKPgy0GkAE zF~}a!!Ko5VktDFjd{@ci-2wa-POsbHVzw^T8yf4KNp+dZ^DXi@6$nri*f7@$cWVmL zt?IPcrwig{!>uoBiGplriz+SI{3gqD6E)}`FF@7JXCv+_>|!00Hpa9e@?A2478{X= z*WmWN_g;gWUzMmD=FEV4%ixsMfL<)>sC3sSn?oX(e)4BpZi?&27tKS5I}}~l^>!1h z-N&WWIBiu~zwx<7tQ{@*ty6nD=z~IiD?yN3NQ&SneDlKr4)_z&)C5ZX%@@f1pr1ba zj$bh}ezzV1*b(q&9eYdmOB&Q#n5cs4{cBjj7JtqNSCkzHYf9;bbBQCmHDR3}zAG6m znj^)R<2bk5F#wRx|2<&2gN|dOj(#eO8wL;iC=J~ zC>42#4&dH$kBBTq%9&!n7aOA2_AJv?%2xPOX}G*qv9_8c)IA{a8`(SQdLTPlKkzeZ z2vygA554piF&w*kNtwPJJRK~a9io9RRF1GP|MS0XN}k#t6#ANY{`kCTFjKgn^^M{7?Vy0P`%MfSR8iK_5E; zX%49%#gTROBv{h!1hkLMv~|he=**lYgFRM2dMO4`eSbnKS47;CPNE++V#Od`0mtV( zH7;UnD_W#X=lP|;qla5$vZQ$F&x_KgFAib;?2<0679hw`SgzS8j(a*t-?sRcFzJ82 z3ARVTo?w}CDJQAEhOA0LZrl3D@CI8XcI>2oM+3}#;YQD<%4io zpsm^=V?6ULmi#H!?{@$b{j#Q#-qg~fS;@9#TBHLh_qV`~7%}LV&s%{elsmOaC74d0jcd>JOP)cRDC{|6~vDTO^U0n%rkTadA{;lCUt8&^tpZO${m*#PGQ*YSB3nvn*^0G z2$RB+%d(c0tUn`*o8$g;6i(k~nz}SWzyvYRGO7zVj4f$bNZRv#$uKz!7(Kmaa8|cz zM)xfHvi{!JpPM-(IAT)1CtFErjjmT_nQE9% z1cY0^C}>nnI4i0i$URTgEI3T)N+-=`O0V987dr$Ifqpj_sS!RVX?`L5K1uLmx z4_O;-Z2|R?%bsEY26Gmy;Nnb5SCo?b_}7@D@plwLTLX#%IB9Pjgt8 zIU9)J{k`5AmVsZR|19$OihizMsuH~=ml+_nfS;jJ&fiL;^KyipmuYJnwHr>$?PQ{;t5C4Od zgk8}c?OQixJn3t)QmHuNLw;VsP41rJY>ZP}{pWwt6t&={r(RTm zmo-q*ixh(m&vw3_`%_s6=7%P%65>1uk-__zwV_jv>aR!u<=QPq zPRf9A)jxL5pt7nl_%pY$7h1YjTAx_+z`!=Pcc}W&|t?-o}#CNdO_W9DIyh zHJXZQ$e8H&vCkz66DGAi44z>C@fU>llM0Q}0{G$V_d$SV_5pC^k4I2&ADV&Xp3mPI zKXsAki!zeug}|ti6`~dPyx{XK&f#S*ToFf`XY2ZzLdel}-Ew3r3X}HY^Ojd6r?7vq zQ`p-0(DLF;yIxfg*Ys~;{CzI9XL}StOJXW0*W26wP;DS^Q2eUoM743b?->v@iI550 zac!r+fusR~TKDfO&#D$h>{el<9#i+pQZP*Ve9tR!&pZQSBx5V6Xk5$MLKKh}1z=e) zutnHxBZQ))CbGD)4=@$*KHCGPvkY)9YJWR4wmG6+YC44l$SH*Gzt4~eG?Ep4)l7A`73a5tzdvx11ApHzDLVh-y zUWS>*v!jsR zX+#*+0uV=>axt!#6@WCwMN)Ha(MX=(&i9_|;BjHZl& z`>k@U`|~wOa82|;=SK(FUIcxWn5uDW^BE(PADd=`EPFo<=!*z~;C+_cSdF{tSM#&k z2TN3VizB2h^DQa@?&GcZLqI6N^I*|-D8JS6ww#16vzGY3j?T1Mj@}2rwBkJ*F`+>q z1nRFrkbhEueXh!+>$Vlt^dEzt3*h|qLYhL(@Jk$EiWCCVenxfccEwXNWrOYQpl0y! z{_F;4$xA4;!ctS%UIojd=`Hv%EZxUjVF)C3f4rsr{K|D{3;$KgKKef$rJS$PMspTh z=-l-Eq{OW#aF1Mj76PVYmRJAgt+WNLqyPASd7Ie22lJU8mX@i7bP0khjnY?HwT1HH zQnJrN+uN{9Uw31;wbPM7qqZW%kW-+Ij(Zrh-rp^Is{;*OPEW&5n;;f7%@}6uJ$tSx zw+o)SC%wt%UcS#ijR5M~_q^p8XMV={xxE2(6@trNG1i_<(zqp~qOJRxJ@{IP=G@s2oRooc8`^<-Ch*jA2l=pZLR83rl}DX=lpgBcv*qY zjxqbyxF?+cPnw%Qud>+bfC36)l%=EHz>j3C9!K%t#=kU*X1S;}DzX9o4o0#-pnNsGW?sc@uP6XQQ~eL|J2x!l6cnQKQ)iD1Iz7515mphhrrZb=cQA1>_hk7OMME_FtY4 zysSeuw{|nXE}ELt|9oP2u;}V4w*ebD4hDDc@AWvLpNjRUixVr11hQwnN!`7^a30+Q)Q4z%l%K8PsAX=I%zz&~i&i zxHTwV@Ud@ONSKBCQ3E!p;^Uh|UGbUW6p9TBF;Tj)+u@G~Sd=JvN{jcG@=|pmR~0O_ zmS_Y-BiQJTr{0f%$^6x)d9^Sb`yK|Rep#ax{l&)2K#69pZ0fbAh_E_zckv= z-L$op|6F#dkAcO*ce!$giGWU(DaFWGbe_dVP`L=|q)N-4u?TOu+1@qIPuf)BmTn=t z6Vi5abRq=%r}HR4DYOby&Lf8Nnt9BI9r!GMQMiof>_frJ85+3W05!wBtG7v;QPsi? zm=tQVFYycd;4(Y1aE!0dHJz)`)U=$hcq})?XKYxUeP!et>c(6JY409H~HqSjCLBHvm=)S&b@b95l%T z)g|b+e&~9$SZg(YOh*g2S=t30rM6N_x5V2PiWR7R`=5RV9z0_XLbsfS#iD?>MQAC} zpP6R@p=}MIKuV`76<}c}W*AA$A+-z_7Hl0+&*(6zn)xgE&Z3g7sB5jo{vl6=lO@rX z7wfso(z~hqwH49J)bj6F zTUo?^SLU3;xD!GzS+~;0NLyLKf{=5A_M-KcUMLVl1l!VLibS?wl{r6y)RfU9B;lFu z%G(HP)A%cK(+S`jNvkZ`f4?uIO`0%5A~K1jtX;9Nbd+b6?R)+nWCcvQ{JaNvo7QLP zN7WYyAahHJu)~6AfFtrgh=H7il^(|JlMKW;|18ZQ$_kQ28%+8NJuZ^BO#K}Q-cQ4~ z4_*SKR>>LIPnOfihFRQx6@DCDe-CI|HU0j)9k_8T|FRicm*)3{<^*saTu=no=z!j{ zq`~3OV5(QJlQX}@FaYaSVQWrCD{XlTNNp5Ua&9V&hDd#$C%3W8c@JExSbO|2k|bMP zmEVWDU}ecY4iN~>&l?0lF=cS-^6+iI{fZVrz9&;5jXa}Z719cPW)5mX!O(q7#n18< zt^(Gak>$WmTy0r6MO zlzuxjU=>{6-kzu?UoAmk)HX)KTPN5^o-=@;P02U#+~Q}9J_FnzKR?Ai z0Jc|J2(Ts7z^ZRe-=?D!QkRO+*tE_0nZopj0LpbkB}Goq+1j%(D6DD%rUI?}taC1t zMz&}GeM<}Xe6>2;-gb``h3!n$kZEV-f`9**4d$nOt zed06eTAAq$!37|Bj@IUxkgHJ8h}bq6qwD4)YUkBQzkQ5|A!m9|IK^84O!QPnFw5xO zVvN2Kk^LP}KTm2kpgxT#=d#|Ywl>;YHUqnDC2nmEq(NIIvD}{l0901DAfjGIdVjFC z+RF4LU5aI(CSrQ@J|A`nAZ@LzdG7vX+b|==W=!R|1co9wB?KY3N&5_gWzSjt{Vrog z>rQYI%-IO3Menv1MlpGx%n3I={7;ko5nZ|&v7+w3!Xbf0=V!dsvhAG6{uCYZo}aU8 zLdSD&E+B3%Lf=XWKH0+{QynU{&<|Pxr;?A?6wPN~|0Cq%T5tF2+kLXs>VkO-03F*S zz49I+N7*TGEFRV1X2zg1MbzqFRaWiXj0ti9Y5^S7E3mM!hS;0gOZmbjpS(4C55Tss zF*TdwUUFD{riKU(TD@z5W{vkLUTHlEi2XbhjcvT#@AaV|=v}bD;LUCcuI2}9w~Igl zk0GGWUoCRAf{gmRk<(c1o8lVcH3Y>046YN9qnkm(7!qDgC(|;5CAGkeS*J*DStw&l z5nRiig!o>?HB|srSg=)tq#M%vf>2y_tLkb~$sf^YnJwkhy~v#aeIAAyM-JUYpAN9o z@0NZwRki8gUWi{R4gu)j1p$C_iV78OnDubvanyaJKB6=qoktyi_@*478S#;U)n z`jch#hOD=s0ev>*DF#eLM7Za(-j;C0=Z^UP**(N|wo;4ER9qjMs(NeEiGLhfF}3VJ z0SKR~trfB4M}6J*$XxdOtg-~s?-N)SWpJ$~_8Mt~p=T_fTR>99*Lbs`e z@Ue+7E(DB?6VJnC{nyzNz&@}wtc;|%;;;s$1eflXJXHx2LR#IL$6LyFkn?IN#=X?^ z7FJWx{O8gr*;wxyHFKc1gy8u2)~HfU9Kqy>WYZgZ4#;x-7B=fE*PmZ2>AL$|Rgx(x z$SOy|Lj!nqu~e!`OU4qI3*0m~ZpB`EkO9!%++t;1Uv11pN(P+bo;N@0!)o`sg$7l31-+1m{5b$Dz2IzP z2c)2`{6I0$u)tNIaJah$n@v|A?hX>*L!X;i&w^(|zRcy4RE*S|l$=!-eXAHC+qWPG z^i2RKQ%&A2Ce?uW3WV2gkuZY-+OfjLKBO=IfP;Z4t8e|Hbr%yXM`a{S(O?)BCvtuA zKjb>m#IGlagQ$mL&p}fehx}FNkNa9J7?muK(s+&wCW-EXd-peHY%*N{06Y?(fk0EV z`N-F7a)Yrr$;!)1BzDNfuZ`_%H$89=q^&Y0B;JT-wt9`Adz^ZJb3VA@S{5m&RPnrZ28%Wr^SmMZdhi`tC zj-0Km7Xv0Y59|A93S4kSf^&XX#j4FHVtj)v!@5s#w+{r(L#RTfNk|zmN>cA%4eR>l zY26N;pZ{+v^@lmoQgtp#q#w$)AEP`%nERYkwef!$~h(rFqD_ikG;9DmY;! z0Ve|3{m7N)<3PSLv`&^Nmdm%o$)g9U@1+Wu<}-j%FYD00{Lzgo@hkG6Uh$`y+Oh}+ zgsLL+Dci6p(wLu)Fh{jxtfQoz<-^KiA_@kPv}-mDl{@Fd5u@fXg*vw>8|eS|pUxB_ ze)NhXfSd+`C>j9)!}LN6K#QI--Pz*)gUbr&ul=~VpoK7?YYPd(q*jbct*k)i%4ZY+ zO0-0@L^-v99EBf&r=h{WUk81WVuCNVq-$ILKuv|EhN1mP0;-E1k)C5{L3w-+-o7+ z#ym?i`6s;gMyU~P$wf~u(-uYq@?9SE)>~U4LVWIxE#zNf7+Z);Q>5cEkxC62CQE{x zW@v;=WIzc40Q?|Bf!}F_42>QTblpM*5-fzkEBLPAn55B5+? ngTns9JpaA^d;RzN|L6KYDyA+w5Vwta00000NkvXXu0mjfKAGgy literal 0 HcmV?d00001 diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 15d39e09a3..3afac04732 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -669,6 +669,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0], lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); if (is_grouped_dense_wgrad) { + NVTE_CHECK(false, "wgrad not supported"); //// RHS NVTEShape rhsShape{.data={k, n}, .ndim=2}; // rhs_is_trans = true; @@ -713,27 +714,46 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // Nominal case for FWD or DGRAD //// RHS - NVTEShape rhsShape{.data={num_gemms*k, n}, .ndim=2}; + NVTEShape rhsShape{.data={num_gemms * k, n}, .ndim=2}; // rhs_is_trans = true; - // if (rhs_is_trans) { - // std::swap(rhsShape.data[0], rhsShape.data[1]); - // } - NVTE_CHECK(!rhs_is_trans, "GroupedGemmFFI currently only supports rhs_is_trans=false"); + + printf("GroupedGemmFFI: (lhs_is_trans=%d, rhs_is_trans=%d) m=%zu, k=%zu, n=%zu, rhs_shape=[", lhs_is_trans, rhs_is_trans, m, k, n); + for (auto dim : rhs_data.dimensions()) { + printf("%zu, ", dim); + } + printf("], lhs_shape=["); + for (auto dim : lhs_data.dimensions()) { + printf("%zu, ", dim); + } + printf("], out_shape=["); + for (auto dim : output->dimensions()) { + printf("%zu, ", dim); + } + printf("]\n"); + + + if (rhs_is_trans) { + rhsShape.data[0] = num_gemms * n; + rhsShape.data[1] = k; + // std::swap(rhsShape.data[0], rhsShape.data[1]); + } + // NVTE_CHECK(!rhs_is_trans, "GroupedGemmFFI currently only supports rhs_is_trans=false"); auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); //// LHS NVTEShape lhsShape{.data={m, k}, .ndim=2}; // NVTE_CHECK(lhs_is_trans, "GroupedGemmFFI currently only supports lhs_is_trans=true"); // lhs_is_trans = true; - // if (!lhs_is_trans) { - // std::swap(lhsShape.data[0], lhsShape.data[1]); - // } if (!lhs_is_trans) { - cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); - return ffi_with_cuda_error_check(); + std::swap(lhsShape.data[0], lhsShape.data[1]); } + // if (!lhs_is_trans) { + // printf("GroupedGemmFFI: lhs_is_trans=false, m=%zu, k=%zu, n=%zu\n", m, k, n); + // cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); + // return ffi_with_cuda_error_check(); + // } auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedFirstDims); + lhs_tensor.set_group_info(group_sizes, group_offset_lhs, lhs_is_trans ? kNVTEGroupedFirstDims : kNVTEGroupedLastDims); //// OUTPUT NVTEShape outShape{.data={m, n}, .ndim=2}; diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index ed1e5dfc38..4296a88f32 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -665,7 +665,7 @@ def _grouped_dense_bwd_rule( dkernel_amax = None # HACK - dgrad = jnp.zeros_like(dgrad) + # dgrad = jnp.zeros_like(dgrad) wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set From e43ba1f864222e879feed5a71d910099513ceeaf Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Mon, 2 Feb 2026 15:39:40 -0800 Subject: [PATCH 72/98] JAX C++ extension RTC --- setup.py | 20 +- transformer_engine/jax/__init__.py | 3 + transformer_engine/jax/rtc/__init__.py | 1 + transformer_engine/jax/rtc/rtc.py | 157 +++++++++++ transformer_engine/jax/rtc/utils.py | 372 +++++++++++++++++++++++++ transformer_engine/jax/setup.py | 6 +- 6 files changed, 546 insertions(+), 13 deletions(-) create mode 100644 transformer_engine/jax/rtc/__init__.py create mode 100644 transformer_engine/jax/rtc/rtc.py create mode 100644 transformer_engine/jax/rtc/utils.py diff --git a/setup.py b/setup.py index 18bb736f24..f3bff3efce 100644 --- a/setup.py +++ b/setup.py @@ -223,16 +223,16 @@ def git_check_submodules() -> None: current_file_path / "transformer_engine", ) ) - if "jax" in frameworks: - from build_tools.jax import setup_jax_extension - - ext_modules.append( - setup_jax_extension( - "transformer_engine/jax/csrc", - current_file_path / "transformer_engine" / "jax" / "csrc", - current_file_path / "transformer_engine", - ) - ) + # if "jax" in frameworks: + # from build_tools.jax import setup_jax_extension + + # ext_modules.append( + # setup_jax_extension( + # "transformer_engine/jax/csrc", + # current_file_path / "transformer_engine" / "jax" / "csrc", + # current_file_path / "transformer_engine", + # ) + # ) # Configure package setuptools.setup( diff --git a/transformer_engine/jax/__init__.py b/transformer_engine/jax/__init__.py index d0afc1ff25..ef249db94c 100644 --- a/transformer_engine/jax/__init__.py +++ b/transformer_engine/jax/__init__.py @@ -29,6 +29,9 @@ from transformer_engine.common import load_framework_extension +from . import rtc +print("Compiling JAX RTC extension...") +rtc.compile_extension() load_framework_extension("jax") from . import flax diff --git a/transformer_engine/jax/rtc/__init__.py b/transformer_engine/jax/rtc/__init__.py new file mode 100644 index 0000000000..795e5e7efc --- /dev/null +++ b/transformer_engine/jax/rtc/__init__.py @@ -0,0 +1 @@ +from .rtc import compile_extension \ No newline at end of file diff --git a/transformer_engine/jax/rtc/rtc.py b/transformer_engine/jax/rtc/rtc.py new file mode 100644 index 0000000000..a76f8b318c --- /dev/null +++ b/transformer_engine/jax/rtc/rtc.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +"""JAX related extensions.""" +import os +from pathlib import Path +from packaging import version + +import setuptools + +from .utils import get_cuda_include_dirs, all_files_in_dir, debug_build_enabled +from typing import List + + +def install_requirements() -> List[str]: + """Install dependencies for TE/JAX extensions.""" + return ["jax", "flax>=0.7.1"] + + +def test_requirements() -> List[str]: + """Test dependencies for TE/JAX extensions. + + Triton Package Selection: + The triton package is selected based on NVTE_USE_PYTORCH_TRITON environment variable: + + Default (NVTE_USE_PYTORCH_TRITON unset or "0"): + Returns 'triton' - OpenAI's standard package from PyPI. + Install with: pip install triton + + NVTE_USE_PYTORCH_TRITON=1: + Returns 'pytorch-triton' - for mixed JAX+PyTorch environments. + Install with: pip install pytorch-triton --index-url https://download.pytorch.org/whl/cu121 + + Note: Do NOT install pytorch-triton from PyPI directly - that's a placeholder. + """ + use_pytorch_triton = bool(int(os.environ.get("NVTE_USE_PYTORCH_TRITON", "0"))) + + triton_package = "pytorch-triton" if use_pytorch_triton else "triton" + + return [ + "numpy", + triton_package, + ] + + +def xla_path() -> str: + """XLA root path lookup. + Throws FileNotFoundError if XLA source is not found.""" + + try: + import jax + + if version.parse(jax.__version__) >= version.parse("0.5.0"): + from jax import ffi # pylint: disable=ungrouped-imports + else: + from jax.extend import ffi # pylint: disable=ungrouped-imports + + except ImportError: + if os.getenv("XLA_HOME"): + xla_home = Path(os.getenv("XLA_HOME")) + else: + xla_home = "/opt/xla" + else: + xla_home = ffi.include_dir() + + if not os.path.isdir(xla_home): + raise FileNotFoundError("Could not find xla source.") + return xla_home + + +def setup_jax_extension( + csrc_source_files, + csrc_header_files, + common_header_files, +) -> setuptools.Extension: + """Setup PyBind11 extension for JAX support""" + # Source files + csrc_source_files = Path(csrc_source_files) + extensions_dir = csrc_source_files / "extensions" + sources = all_files_in_dir(extensions_dir, name_extension="cpp") + + # Header files + include_dirs = get_cuda_include_dirs() + include_dirs.extend( + [ + common_header_files, + common_header_files / "common", + common_header_files / "common" / "include", + csrc_header_files, + xla_path(), + ] + ) + print("Includ dirs for JAX extension:", include_dirs) + + # Compile flags + cxx_flags = ["-O3"] + if debug_build_enabled(): + cxx_flags.append("-g") + cxx_flags.append("-UNDEBUG") + else: + cxx_flags.append("-g0") + + # Define TE/JAX as a Pybind11Extension + from pybind11.setup_helpers import Pybind11Extension + + return Pybind11Extension( + "transformer_engine_jax", + sources=[str(path) for path in sources], + include_dirs=[str(path) for path in include_dirs], + extra_compile_args=cxx_flags, + libraries=["nccl"], + ) + +_compiled = False + +def compile_extension(): + import os + import shutil + + global _compiled + if _compiled: + return + + base_dir = Path(os.path.dirname(__file__)).parent.parent.parent + te_jax_build_dir = base_dir / "build" / "te_jax" + # if os.path.exists(te_jax_build_dir): + # shutil.rmtree(te_jax_build_dir) + + ext = setup_jax_extension( + Path(__file__).resolve().parent.parent / "csrc", + Path(__file__).resolve().parent.parent / "csrc", + Path(__file__).resolve().parent.parent.parent, + ) + from pybind11.setup_helpers import build_ext as BuildExtension + from setuptools import Distribution + import subprocess + + dist = Distribution() + dist.ext_modules = [ext] + cmd = BuildExtension(dist) + cmd.initialize_options() + cmd.parallel = os.cpu_count() # Enable parallel compilation + cmd.finalize_options() + cmd.build_temp = os.path.join(te_jax_build_dir, "temp") + cmd.build_lib = os.path.join(te_jax_build_dir, "lib") + os.makedirs(cmd.build_temp, exist_ok=True) + os.makedirs(cmd.build_lib, exist_ok=True) + cmd.run() + + subprocess.call([ + "cp", + os.path.join(cmd.build_lib, "transformer_engine_jax" + cmd.get_ext_filename(fullname="")), + base_dir, + ]) + + _compiled = True \ No newline at end of file diff --git a/transformer_engine/jax/rtc/utils.py b/transformer_engine/jax/rtc/utils.py new file mode 100644 index 0000000000..8a52440310 --- /dev/null +++ b/transformer_engine/jax/rtc/utils.py @@ -0,0 +1,372 @@ +# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +"""Installation script.""" + +import functools +import glob +import importlib +import os +import re +import shutil +import subprocess +import sys +import platform +from pathlib import Path +from importlib.metadata import version as get_version +from subprocess import CalledProcessError +from typing import List, Optional, Tuple, Union + + +# Needs to stay consistent with .pre-commit-config.yaml config. +def min_python_version() -> Tuple[int]: + """Minimum supported Python version.""" + return (3, 10, 0) + + +def min_python_version_str() -> str: + """String representing minimum supported Python version.""" + return ".".join(map(str, min_python_version())) + + +if sys.version_info < min_python_version(): + raise RuntimeError( + f"Transformer Engine requires Python {min_python_version_str()} or newer, " + f"but found Python {platform.python_version()}." + ) + + +@functools.lru_cache(maxsize=None) +def debug_build_enabled() -> bool: + """Whether to build with a debug configuration""" + return bool(int(os.getenv("NVTE_BUILD_DEBUG", "0"))) + + +@functools.lru_cache(maxsize=None) +def get_max_jobs_for_parallel_build() -> int: + """Number of parallel jobs for Nina build""" + + # Default: maximum parallel jobs + num_jobs = 0 + + # Check environment variable + if os.getenv("NVTE_BUILD_MAX_JOBS"): + num_jobs = int(os.getenv("NVTE_BUILD_MAX_JOBS")) + elif os.getenv("MAX_JOBS"): + num_jobs = int(os.getenv("MAX_JOBS")) + + # Check command-line arguments + for arg in sys.argv.copy(): + if arg.startswith("--parallel="): + num_jobs = int(arg.replace("--parallel=", "")) + sys.argv.remove(arg) + + return num_jobs + + +def all_files_in_dir(path, name_extension=None): + all_files = [] + for dirname, _, names in os.walk(path): + for name in names: + if name_extension is not None and not name.endswith(f".{name_extension}"): + continue + all_files.append(Path(dirname, name)) + return all_files + + +def remove_dups(_list: List): + return list(set(_list)) + + +def found_cmake() -> bool: + """ "Check if valid CMake is available + + CMake 3.18 or newer is required. + + """ + + # Check if CMake is available + try: + _cmake_bin = cmake_bin() + except FileNotFoundError: + return False + + # Query CMake for version info + output = subprocess.run( + [_cmake_bin, "--version"], + capture_output=True, + check=True, + universal_newlines=True, + ) + match = re.search(r"version\s*([\d.]+)", output.stdout) + version = match.group(1).split(".") + version = tuple(int(v) for v in version) + return version >= (3, 18) + + +def cmake_bin() -> Path: + """Get CMake executable + + Throws FileNotFoundError if not found. + + """ + + # Search in CMake Python package + _cmake_bin: Optional[Path] = None + try: + from cmake import CMAKE_BIN_DIR + except ImportError: + pass + else: + _cmake_bin = Path(CMAKE_BIN_DIR).resolve() / "cmake" + if not _cmake_bin.is_file(): + _cmake_bin = None + + # Search in path + if _cmake_bin is None: + _cmake_bin = shutil.which("cmake") + if _cmake_bin is not None: + _cmake_bin = Path(_cmake_bin).resolve() + + # Return executable if found + if _cmake_bin is None: + raise FileNotFoundError("Could not find CMake executable") + return _cmake_bin + + +def found_ninja() -> bool: + """ "Check if Ninja is available""" + return shutil.which("ninja") is not None + + +def found_pybind11() -> bool: + """ "Check if pybind11 is available""" + + # Check if Python package is installed + try: + import pybind11 + except ImportError: + pass + else: + return True + + # Check if CMake can find pybind11 + if not found_cmake(): + return False + try: + subprocess.run( + [ + "cmake", + "--find-package", + "-DMODE=EXIST", + "-DNAME=pybind11", + "-DCOMPILER_ID=CXX", + "-DLANGUAGE=CXX", + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + except (CalledProcessError, OSError): + pass + else: + return True + return False + + +@functools.lru_cache(maxsize=None) +def cuda_toolkit_include_path() -> Tuple[str, str]: + """Returns root path for cuda toolkit includes. + + return `None` if CUDA is not found.""" + # Try finding CUDA + cuda_home: Optional[Path] = None + if cuda_home is None and os.getenv("CUDA_HOME"): + # Check in CUDA_HOME + cuda_home = Path(os.getenv("CUDA_HOME")) / "include" + if cuda_home is None: + # Check in NVCC + nvcc_bin = shutil.which("nvcc") + if nvcc_bin is not None: + cuda_home = Path(nvcc_bin.rstrip("/bin/nvcc")) / "include" + if cuda_home is None: + # Last-ditch guess in /usr/local/cuda + if Path("/usr/local/cuda").is_dir(): + cuda_home = Path("/usr/local/cuda") / "include" + return cuda_home + + +@functools.lru_cache(maxsize=None) +def nvcc_path() -> Tuple[str, str]: + """Returns the NVCC binary path. + + Throws FileNotFoundError if NVCC is not found.""" + # Try finding NVCC + nvcc_bin: Optional[Path] = None + if nvcc_bin is None and os.getenv("CUDA_HOME"): + # Check in CUDA_HOME + cuda_home = Path(os.getenv("CUDA_HOME")) + nvcc_bin = cuda_home / "bin" / "nvcc" + if nvcc_bin is None: + # Check if nvcc is in path + nvcc_bin = shutil.which("nvcc") + if nvcc_bin is not None: + cuda_home = Path(nvcc_bin.rstrip("/bin/nvcc")) + nvcc_bin = Path(nvcc_bin) + if nvcc_bin is None: + # Last-ditch guess in /usr/local/cuda + cuda_home = Path("/usr/local/cuda") + nvcc_bin = cuda_home / "bin" / "nvcc" + if not nvcc_bin.is_file(): + raise FileNotFoundError(f"Could not find NVCC at {nvcc_bin}") + + return nvcc_bin + + +@functools.lru_cache(maxsize=None) +def get_cuda_include_dirs() -> Tuple[str, str]: + """Returns the CUDA header directory.""" + + # If cuda is installed via toolkit, all necessary headers + # are bundled inside the top level cuda directory. + if cuda_toolkit_include_path() is not None: + return [cuda_toolkit_include_path()] + + # Use pip wheels to include all headers. + try: + import nvidia + except ModuleNotFoundError as e: + raise RuntimeError("CUDA not found.") + + cuda_root = Path(nvidia.__file__).parent + return [ + subdir / "include" + for subdir in cuda_root.iterdir() + if subdir.is_dir() and (subdir / "include").is_dir() + ] + + +@functools.lru_cache(maxsize=None) +def cuda_archs() -> str: + archs = os.getenv("NVTE_CUDA_ARCHS") + if archs is None: + version = cuda_version() + if version >= (13, 0): + archs = "75;80;89;90;100;120" + elif version >= (12, 8): + archs = "70;80;89;90;100;120" + else: + archs = "70;80;89;90" + return archs + + +def cuda_version() -> Tuple[int, ...]: + """CUDA Toolkit version as a (major, minor) tuple. + + Try to get cuda version by locating the nvcc executable and running nvcc --version. If + nvcc is not found, look for the cuda runtime package pip `nvidia-cuda-runtime-cu12` + and check pip version. + """ + + try: + nvcc_bin = nvcc_path() + except FileNotFoundError as e: + pass + else: + output = subprocess.run( + [nvcc_bin, "-V"], + capture_output=True, + check=True, + universal_newlines=True, + ) + match = re.search(r"release\s*([\d.]+)", output.stdout) + version = match.group(1).split(".") + return tuple(int(v) for v in version) + + try: + version_str = get_version("nvidia-cuda-runtime-cu12") + version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) + return version_tuple + except importlib.metadata.PackageNotFoundError: + raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.") + + +def get_frameworks() -> List[str]: + """DL frameworks to build support for""" + _frameworks: List[str] = [] + supported_frameworks = ["pytorch", "jax"] + + # Check environment variable + if os.getenv("NVTE_FRAMEWORK"): + _frameworks.extend(os.getenv("NVTE_FRAMEWORK").split(",")) + + # Check command-line arguments + for arg in sys.argv.copy(): + if arg.startswith("--framework="): + _frameworks.extend(arg.replace("--framework=", "").split(",")) + sys.argv.remove(arg) + + # Detect installed frameworks if not explicitly specified + if not _frameworks: + try: + import torch + except ImportError: + pass + else: + _frameworks.append("pytorch") + try: + import jax + except ImportError: + pass + else: + _frameworks.append("jax") + + # Special framework names + if "all" in _frameworks: + _frameworks = supported_frameworks.copy() + if "none" in _frameworks: + _frameworks = [] + + # Check that frameworks are valid + _frameworks = [framework.lower() for framework in _frameworks] + for framework in _frameworks: + if framework not in supported_frameworks: + raise ValueError(f"Transformer Engine does not support framework={framework}") + + return _frameworks + + +def copy_common_headers( + src_dir: Union[Path, str], + dst_dir: Union[Path, str], +) -> None: + """Copy headers from core library + + src_dir should be the transformer_engine directory within the root + Transformer Engine repository. All .h and .cuh files within + transformer_engine/common are copied into dst_dir. Relative paths + are preserved. + + """ + + # Find common header files in src dir + headers = glob.glob( + os.path.join(str(src_dir), "common", "**", "*.h"), + recursive=True, + ) + headers.extend( + glob.glob( + os.path.join(str(src_dir), "common", "**", "*.cuh"), + recursive=True, + ) + ) + headers = [Path(path) for path in headers] + + # Copy common header files to dst dir + src_dir = Path(src_dir) + dst_dir = Path(dst_dir) + for path in headers: + new_path = dst_dir / path.relative_to(src_dir) + new_path.parent.mkdir(exist_ok=True, parents=True) + shutil.copy(path, new_path) diff --git a/transformer_engine/jax/setup.py b/transformer_engine/jax/setup.py index 2d25242825..f0a304c1c2 100644 --- a/transformer_engine/jax/setup.py +++ b/transformer_engine/jax/setup.py @@ -108,9 +108,9 @@ def get_cuda_major_version() -> int: common_headers_dir = "common_headers" copy_common_headers(current_file_path.parent, str(current_file_path / common_headers_dir)) ext_modules = [ - setup_jax_extension( - "csrc", current_file_path / "csrc", current_file_path / common_headers_dir - ) + # setup_jax_extension( + # "csrc", current_file_path / "csrc", current_file_path / common_headers_dir + # ) ] # Setup version and requirements. From 63fadd0d3135dadb76ba2a21610bf807013b307a Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Mon, 2 Feb 2026 16:08:34 -0800 Subject: [PATCH 73/98] wip --- transformer_engine/jax/csrc/extensions/gemm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 3afac04732..5b4ce0736f 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -729,7 +729,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type for (auto dim : output->dimensions()) { printf("%zu, ", dim); } - printf("]\n"); + printf("]\n"); if (rhs_is_trans) { @@ -770,8 +770,8 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); nvte_grouped_gemm( - rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, + rhs_tensor, rhs_is_trans, nullptr, out_tensor, alpha_tensor.data(), From b2d9e26c77cfc3ce877246f68799497d2476d764 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Mon, 2 Feb 2026 16:22:29 -0800 Subject: [PATCH 74/98] wgrad might be working! --- .../jax/csrc/extensions/gemm.cpp | 33 ++++++++++--------- transformer_engine/jax/dense.py | 2 +- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 5b4ce0736f..48a120284a 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -669,7 +669,22 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0], lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); if (is_grouped_dense_wgrad) { - NVTE_CHECK(false, "wgrad not supported"); + printf("GroupedGemmFFI: (lhs_is_trans=%d, rhs_is_trans=%d) m=%zu, k=%zu, n=%zu, rhs_shape=[", lhs_is_trans, rhs_is_trans, m, k, n); + for (auto dim : rhs_data.dimensions()) { + printf("%zu, ", dim); + } + printf("], lhs_shape=["); + for (auto dim : lhs_data.dimensions()) { + printf("%zu, ", dim); + } + printf("], out_shape=["); + for (auto dim : output->dimensions()) { + printf("%zu, ", dim); + } + printf("]\n"); + + NVTE_CHECK(lhs_is_trans && !rhs_is_trans, "For grouped dense wgrad, only TN GEMM is supported in TE/JAX currently."); + //// RHS NVTEShape rhsShape{.data={k, n}, .ndim=2}; // rhs_is_trans = true; @@ -698,8 +713,8 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); nvte_grouped_gemm( - rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, + rhs_tensor, rhs_is_trans, nullptr, out_tensor, alpha_tensor.data(), @@ -717,20 +732,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type NVTEShape rhsShape{.data={num_gemms * k, n}, .ndim=2}; // rhs_is_trans = true; - printf("GroupedGemmFFI: (lhs_is_trans=%d, rhs_is_trans=%d) m=%zu, k=%zu, n=%zu, rhs_shape=[", lhs_is_trans, rhs_is_trans, m, k, n); - for (auto dim : rhs_data.dimensions()) { - printf("%zu, ", dim); - } - printf("], lhs_shape=["); - for (auto dim : lhs_data.dimensions()) { - printf("%zu, ", dim); - } - printf("], out_shape=["); - for (auto dim : output->dimensions()) { - printf("%zu, ", dim); - } - printf("]\n"); - if (rhs_is_trans) { rhsShape.data[0] = num_gemms * n; diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index 4296a88f32..012d37cf22 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -666,7 +666,7 @@ def _grouped_dense_bwd_rule( # HACK # dgrad = jnp.zeros_like(dgrad) - wgrad = jnp.zeros_like(wgrad) + # wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set From 994b6865192d04258d9309f6c29c37994b029325 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 3 Feb 2026 09:44:50 -0800 Subject: [PATCH 75/98] wip --- .../jax/csrc/extensions/gemm.cpp | 48 +++++++++++++------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 48a120284a..b59bbc158d 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -414,7 +414,19 @@ class JAXX_GroupedTensorWrapper { JAXX_GroupedTensorWrapper() = delete; JAXX_GroupedTensorWrapper(JAXX_Scaling_Mode scaling_mode, size_t num_tensors, NVTEShape const& dataShape); - ~JAXX_GroupedTensorWrapper() = default; + JAXX_GroupedTensorWrapper(JAXX_GroupedTensorWrapper const&) = delete; + JAXX_GroupedTensorWrapper& operator=(JAXX_GroupedTensorWrapper const&) = delete; + JAXX_GroupedTensorWrapper(JAXX_GroupedTensorWrapper&& other) noexcept + : m_data_shape(other.m_data_shape), + m_grouped_tensor(other.m_grouped_tensor), + m_data_tensor(other.m_data_tensor), + m_scale_inv_tensor(other.m_scale_inv_tensor), + m_sizes_tensor(other.m_sizes_tensor), + m_offsets_tensor(other.m_offsets_tensor) { + other.m_grouped_tensor = nullptr; + } + JAXX_GroupedTensorWrapper& operator=(JAXX_GroupedTensorWrapper&&) = delete; + ~JAXX_GroupedTensorWrapper(); void set_rowwise(Buffer_Type const& data, std::optional const& scale_inv); void set_group_info(Buffer_Type const& group_sizes, Buffer_Type const& group_offsets, @@ -442,6 +454,12 @@ JAXX_GroupedTensorWrapper::JAXX_GroupedTensorWrapper(JAXX_Scaling_Mode scaling_m m_grouped_tensor = nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, dataShape); } +JAXX_GroupedTensorWrapper::~JAXX_GroupedTensorWrapper() { + if (m_grouped_tensor != nullptr) { + nvte_destroy_grouped_tensor(m_grouped_tensor); + } +} + void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const& data, std::optional const& scale_inv) { // printf("set_rowwise data shape: XLA buffer shape: "); @@ -528,7 +546,7 @@ JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const& data, std::opti } grouped_tensor_wrapper.set_rowwise(data, scale_inv); - return grouped_tensor_wrapper; + return std::move(grouped_tensor_wrapper); } Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv, @@ -669,19 +687,19 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0], lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); if (is_grouped_dense_wgrad) { - printf("GroupedGemmFFI: (lhs_is_trans=%d, rhs_is_trans=%d) m=%zu, k=%zu, n=%zu, rhs_shape=[", lhs_is_trans, rhs_is_trans, m, k, n); - for (auto dim : rhs_data.dimensions()) { - printf("%zu, ", dim); - } - printf("], lhs_shape=["); - for (auto dim : lhs_data.dimensions()) { - printf("%zu, ", dim); - } - printf("], out_shape=["); - for (auto dim : output->dimensions()) { - printf("%zu, ", dim); - } - printf("]\n"); + // printf("GroupedGemmFFI: (lhs_is_trans=%d, rhs_is_trans=%d) m=%zu, k=%zu, n=%zu, rhs_shape=[", lhs_is_trans, rhs_is_trans, m, k, n); + // for (auto dim : rhs_data.dimensions()) { + // printf("%zu, ", dim); + // } + // printf("], lhs_shape=["); + // for (auto dim : lhs_data.dimensions()) { + // printf("%zu, ", dim); + // } + // printf("], out_shape=["); + // for (auto dim : output->dimensions()) { + // printf("%zu, ", dim); + // } + // printf("]\n"); NVTE_CHECK(lhs_is_trans && !rhs_is_trans, "For grouped dense wgrad, only TN GEMM is supported in TE/JAX currently."); From 223eeff36a35f40835ccc950e20b097753ae22d7 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 3 Feb 2026 13:26:32 -0800 Subject: [PATCH 76/98] wip (memcpy to host for debugging) --- .../jax/csrc/extensions/gemm.cpp | 46 ++++++++++++++++++- transformer_engine/jax/dense.py | 4 +- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index b59bbc158d..83dea142e0 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -741,6 +741,9 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type workspace_cublas.data(), nullptr, // config (use defaults) stream); + + cudaStreamSynchronize(stream); + return ffi_with_cuda_error_check(); } @@ -786,8 +789,22 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); + // This memset is required because the group sizes may not fill the full buffer since we overallocate for the worst case. However, in theory unused space on the grouped axis should not be utilizied downstream, but it seems like somehow it is utilized. cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); + std::vector host_group_sizes(num_gemms); + cudaMemcpyAsync(host_group_sizes.data(), group_sizes.untyped_data(), + num_gemms * sizeof(int32_t), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + int currentDevice; + cudaGetDevice(¤tDevice); + printf("[gpu=%d] Group sizes[total_group_size=%zu, m=%zu]: ", currentDevice, std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL), m); + for (size_t i = 0; i < num_gemms; ++i) { + printf("%d, ", host_group_sizes[i]); + } + printf("\n"); + nvte_grouped_gemm( lhs_tensor, lhs_is_trans, rhs_tensor, rhs_is_trans, @@ -799,6 +816,31 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type workspace_cublas.data(), nullptr, // config (use defaults) stream); + size_t _offset = std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL) * n * out_dtype_bytes; + cudaMemsetAsync(output->untyped_data() + _offset, 0, output->size_bytes() - _offset, stream); + + std::vector<__bf16> debug_output(m*n); + cudaMemcpyAsync(debug_output.data(), output->untyped_data(), + m * n * out_dtype_bytes, cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + size_t totalPrints = 0; + constexpr size_t MAX_PRINTS = 1; + for (size_t i_m = 0; i_m < m; i_m++) { + for (size_t i_n = 0; i_n < n; i_n++) { + size_t index = i_m * n + i_n; + if (isnan(static_cast(debug_output[index])) || isinf(static_cast(debug_output[index]))) { + printf("[gpu=%d] Output contains NaN or Inf at index [%zu, %zu] (flat index %zu)\n", i_m, i_n, index); + totalPrints++; + if (totalPrints >= MAX_PRINTS) { + break; + } + } + } + if (totalPrints >= MAX_PRINTS) { + break; + } + } return ffi_with_cuda_error_check(); } @@ -826,8 +868,8 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmHandler, GroupedGemmFFI, .Attr("scaling_mode") .Attr("has_bias") .Attr("is_grouped_dense_wgrad") - .Attr("use_async_d2h_group_sizes"), - FFI_CudaGraph_Traits); + .Attr("use_async_d2h_group_sizes")/*, + FFI_CudaGraph_Traits*/); } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index 012d37cf22..ed1e5dfc38 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -665,8 +665,8 @@ def _grouped_dense_bwd_rule( dkernel_amax = None # HACK - # dgrad = jnp.zeros_like(dgrad) - # wgrad = jnp.zeros_like(wgrad) + dgrad = jnp.zeros_like(dgrad) + wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set From f693220efc357c64aa79b463945d342fd7b66a7d Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Wed, 4 Feb 2026 14:50:34 -0800 Subject: [PATCH 77/98] initial debug of inspect ffi Signed-off-by: Jeremy Berchtold --- transformer_engine/jax/cpp_extensions/gemm.py | 8 +- transformer_engine/jax/csrc/extensions.h | 3 + .../jax/csrc/extensions/amax.cpp | 23 ++++ .../jax/csrc/extensions/pybind.cpp | 3 + transformer_engine/jax/inspect.py | 111 ++++++++++++++++++ 5 files changed, 144 insertions(+), 4 deletions(-) create mode 100644 transformer_engine/jax/inspect.py diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 71f133bfc4..d400412386 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -373,10 +373,10 @@ def assert_cublas_requirements(scaling_mode, contracting_size, tensor_name): # Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage alignment = 32 if scaling_mode.is_nvfp4_scaling else 16 - assert contracting_size % alignment == 0, ( - f"cuBLAS GEMM {tensor_name} tensor's contracting dimension must be a multiple of" - f" {alignment} when using quantized inputs. Got contracting_size={contracting_size}" - ) + # assert contracting_size % alignment == 0, ( + # f"cuBLAS GEMM {tensor_name} tensor's contracting dimension must be a multiple of" + # f" {alignment} when using quantized inputs. Got contracting_size={contracting_size}" + # ) class GemmPrimitive(BasePrimitive): diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h index 3fd086e257..1c0bc52b88 100644 --- a/transformer_engine/jax/csrc/extensions.h +++ b/transformer_engine/jax/csrc/extensions.h @@ -143,6 +143,9 @@ XLA_FFI_DECLARE_HANDLER_SYMBOL(GroupedGemmHandler); XLA_FFI_DECLARE_HANDLER_SYMBOL(RHTAmaxCalculationInitializeHandler); XLA_FFI_DECLARE_HANDLER_SYMBOL(RHTAmaxCalculationHandler); +// Inspect +XLA_FFI_DECLARE_HANDLER_SYMBOL(InspectHandler); + // Cudnn helpers XLA_FFI_DECLARE_HANDLER_SYMBOL(CudnnHandleInitHandler); diff --git a/transformer_engine/jax/csrc/extensions/amax.cpp b/transformer_engine/jax/csrc/extensions/amax.cpp index 5ffccaffb4..61cfa206c3 100644 --- a/transformer_engine/jax/csrc/extensions/amax.cpp +++ b/transformer_engine/jax/csrc/extensions/amax.cpp @@ -96,5 +96,28 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL( .Attr("produce_regular_amax") // produce_regular_amax .Attr("flatten_axis")); // flatten_axis + +Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type output_buf) { + NVTE_CHECK(input_buf.untyped_data() != nullptr, + "Input must be provided for inspect operation"); + NVTE_CHECK(output_buf->untyped_data() != nullptr, + "Output must be provided for inspect operation"); + NVTE_CHECK(input_buf.untyped_data() == output_buf->untyped_data(), + "Input and output must point to the same buffer for inspect operation"); + + printf("JTEST: Hello\n"); + + return ffi_with_cuda_error_check(); +} + +XLA_FFI_DEFINE_HANDLER_SYMBOL( + InspectHandler, InspectFFI, + FFI::Bind() + .Ctx() // stream + .Arg() // input + .Ret() // output + ); + + } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp index a5986404c9..3f05b57077 100644 --- a/transformer_engine/jax/csrc/extensions/pybind.cpp +++ b/transformer_engine/jax/csrc/extensions/pybind.cpp @@ -81,6 +81,9 @@ pybind11::dict Registrations() { pybind11::arg("initialize") = EncapsulateFFI(RHTAmaxCalculationInitializeHandler), pybind11::arg("execute") = EncapsulateFFI(RHTAmaxCalculationHandler)); + dict["te_inspect_ffi"] = pybind11::dict( + pybind11::arg("execute") = EncapsulateFFI(InspectHandler)); + return dict; } diff --git a/transformer_engine/jax/inspect.py b/transformer_engine/jax/inspect.py new file mode 100644 index 0000000000..849cfb4491 --- /dev/null +++ b/transformer_engine/jax/inspect.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""JAX array inspection utilities.""" + +from functools import partial + +import jax +import jax.numpy as jnp +from jax import ffi + +from .cpp_extensions.base import BasePrimitive, register_primitive + +__all__ = ["inspect_array"] + + +class InspectPrimitive(BasePrimitive): + """ + No-op used for inspect array values. + """ + + name = "te_inspect_ffi" + multiple_results = False + impl_static_args = () + inner_primitive = None + outer_primitive = None + + @staticmethod + def abstract( + x_aval, + ): + """ + inspect abstract + """ + return x_aval + + @staticmethod + def lowering( + ctx, + x, + ): + """ + inspect lowering rules + """ + + return ffi.ffi_lowering( + InspectPrimitive.name, + operand_output_aliases={0: 0}, # donate input buffer to output buffer + )( + ctx, + x, + ) + + @staticmethod + def impl( + x, + ): + """ + inspect implementation + """ + assert InspectPrimitive.inner_primitive is not None + ( + x + ) = InspectPrimitive.inner_primitive.bind( + x, + ) + return x + +register_primitive(InspectPrimitive) + +@partial(jax.custom_vjp, nondiff_argnums=()) +def _inspect( + x, +): + """ + """ + output, _ = _inspect_fwd_rule( + x, + ) + return output + + +def _inspect_fwd_rule( + x, +): + """""" + ctx = () + x = InspectPrimitive.outer_primitive.bind(x) + return x, ctx + + +def _inspect_bwd_rule( + ctx, + grad, +): + """""" + del ctx + return grad, + + +_inspect.defvjp(_inspect_fwd_rule, _inspect_bwd_rule) + +def inspect_array(x: jnp.ndarray, name: str) -> jnp.ndarray: + """Utility function to inspect JAX arrays by printing their name, shape, dtype, and statistics. + + Args: + x (jnp.ndarray): The JAX array to inspect. + name (str): The name of the array for identification in the output. + """ + # TODO: Handle the name of the tensor in the primitive and output files + return _inspect(x) \ No newline at end of file From f2d1629f3a4ce364dad7fba24688ae4a16dbb4c0 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Wed, 4 Feb 2026 15:16:33 -0800 Subject: [PATCH 78/98] writing binary dumps of tensors works Signed-off-by: Jeremy Berchtold --- .../jax/csrc/extensions/amax.cpp | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/amax.cpp b/transformer_engine/jax/csrc/extensions/amax.cpp index 61cfa206c3..52ef9c47fb 100644 --- a/transformer_engine/jax/csrc/extensions/amax.cpp +++ b/transformer_engine/jax/csrc/extensions/amax.cpp @@ -6,6 +6,7 @@ #include #include +#include #include "../extensions.h" #include "transformer_engine/cast.h" @@ -98,16 +99,33 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL( Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type output_buf) { - NVTE_CHECK(input_buf.untyped_data() != nullptr, - "Input must be provided for inspect operation"); - NVTE_CHECK(output_buf->untyped_data() != nullptr, - "Output must be provided for inspect operation"); - NVTE_CHECK(input_buf.untyped_data() == output_buf->untyped_data(), - "Input and output must point to the same buffer for inspect operation"); + NVTE_CHECK(input_buf.untyped_data() != nullptr, + "Input must be provided for inspect operation"); + NVTE_CHECK(output_buf->untyped_data() != nullptr, + "Output must be provided for inspect operation"); + NVTE_CHECK(input_buf.untyped_data() == output_buf->untyped_data(), + "Input and output must point to the same buffer for inspect operation"); - printf("JTEST: Hello\n"); - return ffi_with_cuda_error_check(); + std::vector input_data(input_buf.size_bytes()); + cudaMemcpyAsync(input_data.data(), input_buf.untyped_data(), input_buf.size_bytes(), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + int device; + cudaGetDevice(&device); + + std::string filename = "my_tensor_gpu" + std::to_string(device) + ".bin"; + std::ofstream file(filename, std::ios::binary); + if (file.is_open()) { + file.write(reinterpret_cast(input_data.data()), input_data.size()); + file.close(); + } + printf("Tensor data written to %s\n", filename.c_str()); + + // TODO: make a metadata file with tensor shape and dtype? + + return ffi_with_cuda_error_check(); } XLA_FFI_DEFINE_HANDLER_SYMBOL( From f56d8696bbcdc33649fab13198ec9eb557de9581 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Wed, 4 Feb 2026 15:27:10 -0800 Subject: [PATCH 79/98] loading works Signed-off-by: Jeremy Berchtold --- .../jax/csrc/extensions/amax.cpp | 10 ++++++++- transformer_engine/jax/inspect.py | 21 +++++++++++++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/amax.cpp b/transformer_engine/jax/csrc/extensions/amax.cpp index 52ef9c47fb..97728303a2 100644 --- a/transformer_engine/jax/csrc/extensions/amax.cpp +++ b/transformer_engine/jax/csrc/extensions/amax.cpp @@ -121,7 +121,15 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type ou file.write(reinterpret_cast(input_data.data()), input_data.size()); file.close(); } - printf("Tensor data written to %s\n", filename.c_str()); + printf("Tensor data written to %s (shape: [", filename.c_str()); + for (size_t i = 0; i < input_buf.dimensions().size(); ++i) { + printf("%ld", static_cast(input_buf.dimensions()[i])); + if (i < input_buf.dimensions().size() - 1) { + printf(", "); + } + } + printf("], dtype: %d)\n", static_cast(input_buf.element_type())); + // TODO: make a metadata file with tensor shape and dtype? diff --git a/transformer_engine/jax/inspect.py b/transformer_engine/jax/inspect.py index 849cfb4491..d9f8d70bc9 100644 --- a/transformer_engine/jax/inspect.py +++ b/transformer_engine/jax/inspect.py @@ -11,7 +11,7 @@ from .cpp_extensions.base import BasePrimitive, register_primitive -__all__ = ["inspect_array"] +__all__ = ["inspect_array", "load_array_dump"] class InspectPrimitive(BasePrimitive): @@ -108,4 +108,21 @@ def inspect_array(x: jnp.ndarray, name: str) -> jnp.ndarray: name (str): The name of the array for identification in the output. """ # TODO: Handle the name of the tensor in the primitive and output files - return _inspect(x) \ No newline at end of file + return _inspect(x) + + +def load_array_dump(filename: str, shape: tuple, dtype: jnp.dtype) -> jnp.ndarray: + """Utility function to load a JAX array from a dumped binary file. + + Args: + filename (str): The path to the binary file containing the array data. + shape (tuple): The shape of the array to be loaded. + dtype (jnp.dtype): The data type of the array to be loaded. + + Returns: + jnp.ndarray: The loaded JAX array. + """ + with open(filename, "rb") as f: + data = f.read() + array = jnp.frombuffer(data, dtype=dtype).reshape(shape) + return array \ No newline at end of file From 37a7dd5b059ee4e200cb7139e22fbe1b03f83912 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 23:28:53 +0000 Subject: [PATCH 80/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../jax/csrc/extensions/amax.cpp | 78 +++++++++---------- .../jax/csrc/extensions/pybind.cpp | 4 +- transformer_engine/jax/inspect.py | 16 ++-- 3 files changed, 46 insertions(+), 52 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/amax.cpp b/transformer_engine/jax/csrc/extensions/amax.cpp index 97728303a2..e18ee99d81 100644 --- a/transformer_engine/jax/csrc/extensions/amax.cpp +++ b/transformer_engine/jax/csrc/extensions/amax.cpp @@ -5,8 +5,8 @@ ************************************************************************/ #include -#include #include +#include #include "../extensions.h" #include "transformer_engine/cast.h" @@ -97,53 +97,47 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL( .Attr("produce_regular_amax") // produce_regular_amax .Attr("flatten_axis")); // flatten_axis - Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type output_buf) { - NVTE_CHECK(input_buf.untyped_data() != nullptr, - "Input must be provided for inspect operation"); - NVTE_CHECK(output_buf->untyped_data() != nullptr, - "Output must be provided for inspect operation"); - NVTE_CHECK(input_buf.untyped_data() == output_buf->untyped_data(), - "Input and output must point to the same buffer for inspect operation"); - - - std::vector input_data(input_buf.size_bytes()); - cudaMemcpyAsync(input_data.data(), input_buf.untyped_data(), input_buf.size_bytes(), - cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - int device; - cudaGetDevice(&device); - - std::string filename = "my_tensor_gpu" + std::to_string(device) + ".bin"; - std::ofstream file(filename, std::ios::binary); - if (file.is_open()) { - file.write(reinterpret_cast(input_data.data()), input_data.size()); - file.close(); - } - printf("Tensor data written to %s (shape: [", filename.c_str()); - for (size_t i = 0; i < input_buf.dimensions().size(); ++i) { - printf("%ld", static_cast(input_buf.dimensions()[i])); - if (i < input_buf.dimensions().size() - 1) { - printf(", "); - } + NVTE_CHECK(input_buf.untyped_data() != nullptr, "Input must be provided for inspect operation"); + NVTE_CHECK(output_buf->untyped_data() != nullptr, + "Output must be provided for inspect operation"); + NVTE_CHECK(input_buf.untyped_data() == output_buf->untyped_data(), + "Input and output must point to the same buffer for inspect operation"); + + std::vector input_data(input_buf.size_bytes()); + cudaMemcpyAsync(input_data.data(), input_buf.untyped_data(), input_buf.size_bytes(), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + int device; + cudaGetDevice(&device); + + std::string filename = "my_tensor_gpu" + std::to_string(device) + ".bin"; + std::ofstream file(filename, std::ios::binary); + if (file.is_open()) { + file.write(reinterpret_cast(input_data.data()), input_data.size()); + file.close(); + } + printf("Tensor data written to %s (shape: [", filename.c_str()); + for (size_t i = 0; i < input_buf.dimensions().size(); ++i) { + printf("%ld", static_cast(input_buf.dimensions()[i])); + if (i < input_buf.dimensions().size() - 1) { + printf(", "); } - printf("], dtype: %d)\n", static_cast(input_buf.element_type())); - + } + printf("], dtype: %d)\n", static_cast(input_buf.element_type())); - // TODO: make a metadata file with tensor shape and dtype? + // TODO: make a metadata file with tensor shape and dtype? - return ffi_with_cuda_error_check(); + return ffi_with_cuda_error_check(); } -XLA_FFI_DEFINE_HANDLER_SYMBOL( - InspectHandler, InspectFFI, - FFI::Bind() - .Ctx() // stream - .Arg() // input - .Ret() // output - ); - +XLA_FFI_DEFINE_HANDLER_SYMBOL(InspectHandler, InspectFFI, + FFI::Bind() + .Ctx() // stream + .Arg() // input + .Ret() // output +); } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp index 3f05b57077..5a8ee18f09 100644 --- a/transformer_engine/jax/csrc/extensions/pybind.cpp +++ b/transformer_engine/jax/csrc/extensions/pybind.cpp @@ -81,8 +81,8 @@ pybind11::dict Registrations() { pybind11::arg("initialize") = EncapsulateFFI(RHTAmaxCalculationInitializeHandler), pybind11::arg("execute") = EncapsulateFFI(RHTAmaxCalculationHandler)); - dict["te_inspect_ffi"] = pybind11::dict( - pybind11::arg("execute") = EncapsulateFFI(InspectHandler)); + dict["te_inspect_ffi"] = + pybind11::dict(pybind11::arg("execute") = EncapsulateFFI(InspectHandler)); return dict; } diff --git a/transformer_engine/jax/inspect.py b/transformer_engine/jax/inspect.py index d9f8d70bc9..61bbaf8bb0 100644 --- a/transformer_engine/jax/inspect.py +++ b/transformer_engine/jax/inspect.py @@ -45,7 +45,7 @@ def lowering( return ffi.ffi_lowering( InspectPrimitive.name, - operand_output_aliases={0: 0}, # donate input buffer to output buffer + operand_output_aliases={0: 0}, # donate input buffer to output buffer )( ctx, x, @@ -59,21 +59,20 @@ def impl( inspect implementation """ assert InspectPrimitive.inner_primitive is not None - ( - x - ) = InspectPrimitive.inner_primitive.bind( + (x) = InspectPrimitive.inner_primitive.bind( x, ) return x + register_primitive(InspectPrimitive) + @partial(jax.custom_vjp, nondiff_argnums=()) def _inspect( x, ): - """ - """ + """ """ output, _ = _inspect_fwd_rule( x, ) @@ -95,11 +94,12 @@ def _inspect_bwd_rule( ): """""" del ctx - return grad, + return (grad,) _inspect.defvjp(_inspect_fwd_rule, _inspect_bwd_rule) + def inspect_array(x: jnp.ndarray, name: str) -> jnp.ndarray: """Utility function to inspect JAX arrays by printing their name, shape, dtype, and statistics. @@ -125,4 +125,4 @@ def load_array_dump(filename: str, shape: tuple, dtype: jnp.dtype) -> jnp.ndarra with open(filename, "rb") as f: data = f.read() array = jnp.frombuffer(data, dtype=dtype).reshape(shape) - return array \ No newline at end of file + return array From f3a9fad0404abcbb6a51edb178f88f0ae647e069 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Feb 2026 17:56:22 +0000 Subject: [PATCH 81/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test_einsum.py | 51 +++-- tests/jax/test_custom_call_compute.py | 110 ++++++---- transformer_engine/common/cast/cast.cu | 4 +- .../common/gemm/cublaslt_gemm.cu | 2 +- transformer_engine/jax/__init__.py | 1 + transformer_engine/jax/cpp_extensions/base.py | 6 +- transformer_engine/jax/cpp_extensions/gemm.py | 77 ++++--- .../jax/cpp_extensions/quantization.py | 10 +- .../jax/csrc/extensions/gemm.cpp | 199 +++++++++--------- .../jax/csrc/extensions/quantization.cpp | 7 +- transformer_engine/jax/dense.py | 82 ++++---- transformer_engine/jax/flax/__init__.py | 7 +- transformer_engine/jax/flax/module.py | 169 ++++++++------- transformer_engine/jax/rtc/__init__.py | 2 +- transformer_engine/jax/rtc/rtc.py | 18 +- transformer_engine/jax/sharding.py | 5 +- 16 files changed, 418 insertions(+), 332 deletions(-) diff --git a/test_einsum.py b/test_einsum.py index 5bb05403f2..1b1f502c51 100644 --- a/test_einsum.py +++ b/test_einsum.py @@ -4,29 +4,39 @@ import jax.numpy as jnp import numpy as np import transformer_engine.jax as te -from transformer_engine.common.recipe import Recipe, Float8CurrentScaling, MXFP8BlockScaling, DelayedScaling, NVFP4BlockScaling +from transformer_engine.common.recipe import ( + Recipe, + Float8CurrentScaling, + MXFP8BlockScaling, + DelayedScaling, + NVFP4BlockScaling, +) from flax import linen as nn + def make_einsum_cls(quantization_recipe): def te_einsum(generate_quantizer_set, s, x, kernel, **kwargs): - def dot_general(x, kernel, dims, *args, **kwargs): - contracting_dims, batch_dims = dims - assert batch_dims == ((), ()), "Batch dims not supported in TE/JAX yet" - - quantizer_set = generate_quantizer_set("quantizer_set_for_einsum") - return te.dense.dense( - x, - kernel, - contracting_dims=contracting_dims, - quantizer_set=quantizer_set, - ) - return jnp.einsum(s, x, kernel, _dot_general=dot_general, **kwargs) - + def dot_general(x, kernel, dims, *args, **kwargs): + contracting_dims, batch_dims = dims + assert batch_dims == ((), ()), "Batch dims not supported in TE/JAX yet" + + quantizer_set = generate_quantizer_set("quantizer_set_for_einsum") + return te.dense.dense( + x, + kernel, + contracting_dims=contracting_dims, + quantizer_set=quantizer_set, + ) + + return jnp.einsum(s, x, kernel, _dot_general=dot_general, **kwargs) + return te.flax.wrap_function_in_te_state_module(te_einsum, quantization_recipe, "einsum")() + class EinsumType(Enum): - JAX = 'jax' - TE = 'te' + JAX = "jax" + TE = "te" + def main(): @@ -47,9 +57,10 @@ def _einsum(self, *args, **kwargs): @nn.compact def __call__(self, x): - kernel = self.param('kernel', jax.nn.initializers.lecun_normal(), (32, 32), jnp.bfloat16) + kernel = self.param( + "kernel", jax.nn.initializers.lecun_normal(), (32, 32), jnp.bfloat16 + ) return self._einsum("ij,jk->ik", x, kernel) - def test_model(einsum_type: EinsumType, quantization_recipe: Recipe = None): model = SimpleModel(einsum_type=einsum_type, quantization_recipe=quantization_recipe) @@ -68,7 +79,7 @@ def test_model(einsum_type: EinsumType, quantization_recipe: Recipe = None): # Compare outputs atol = float(jnp.finfo(jnp.float8_e4m3fn).eps) np.testing.assert_allclose(ref_out, te_out, atol=atol) - + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index 737b7e2e06..cc8896da48 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1762,11 +1762,8 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): GROUPED_DENSE_INPUT_SHAPES = [ # (n_groups, m, n, k), the actual m will be multiplied by 32 # (5, 32, 128, 64), # Test the case where n_groups is not a multiple of 4 - # (4, 16, 4, 4), - # (3, 192, 64, 96), - (8, 16384, 14336, 4096), # (8, 16384, 16384, 4096), # (8, 64, 32, 128), @@ -1779,6 +1776,7 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): ScalingMode.CURRENT_TENSOR_SCALING ] + @pytest_parametrize_wrapper("input_shape", GROUPED_DENSE_INPUT_SHAPES) class TestGroupedDense: def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): @@ -1815,7 +1813,7 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi # m //= 32 # group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) # group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) - # group_sizes = jnp.diff(group_sizes) + # group_sizes = jnp.diff(group_sizes) # # Make one empty input lhs to test empty GEMM handling # group_sizes = group_sizes.at[0].set(group_sizes[0] + group_sizes[1]) @@ -1839,19 +1837,20 @@ def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", wi def load_tensor(name: str): import numpy as np - tensor = np.load(f'/mnt/jberchtold/polyphe-lustre-home/maxtext/gemm_{name}.npy') + + tensor = np.load(f"/mnt/jberchtold/polyphe-lustre-home/maxtext/gemm_{name}.npy") return jnp.array(tensor) - lhs = load_tensor('lhs').astype(dtype) - rhs = load_tensor('rhs').astype(dtype) + lhs = load_tensor("lhs").astype(dtype) + rhs = load_tensor("rhs").astype(dtype) bias = None - group_sizes = load_tensor('group_sizes').astype(jnp.int32) + group_sizes = load_tensor("group_sizes").astype(jnp.int32) lhs_contracting_dim = (1,) if data_layout[0] == "N" else (0,) rhs_contracting_dim = (1,) if data_layout[1] == "N" else (2,) contracting_dims = (lhs_contracting_dim, rhs_contracting_dim) - print(f'{lhs.shape=}, {rhs.shape=}, {group_sizes=}, {contracting_dims=}') + print(f"{lhs.shape=}, {rhs.shape=}, {group_sizes=}, {contracting_dims=}") # import pdb; pdb.set_trace() return lhs, rhs, group_sizes, contracting_dims, bias @@ -1859,45 +1858,46 @@ def load_tensor(name: str): def _tensor_to_image(self, tensor, value_range=None): import numpy as np from PIL import Image + # Convert to numpy tensor_np = jnp.array(tensor, dtype=jnp.float32) - + # Replace NaNs with a large value for visualization tensor_np = jnp.where(jnp.isnan(tensor_np), 5000, tensor_np) - + # Determine normalization range if value_range is None: min_val = tensor_np.min() max_val = tensor_np.max() else: min_val, max_val = value_range - + # Normalize to 0-255 range for visualization range_val = max_val - min_val + 1e-8 normalized = jnp.clip((tensor_np - min_val) / range_val * 255, 0, 255) - + # Downsample by averaging 4x4 blocks h, w = normalized.shape new_h, new_w = h // 4, w // 4 - normalized = normalized[:new_h*4, :new_w*4] # Trim to multiple of 4 + normalized = normalized[: new_h * 4, : new_w * 4] # Trim to multiple of 4 normalized = normalized.reshape(new_h, 4, new_w, 4).mean(axis=(1, 3)) normalized = np.array(normalized) normalized_uint8 = normalized.astype(np.uint8) - + # Create grayscale image - img = Image.fromarray(normalized_uint8, mode='L') + img = Image.fromarray(normalized_uint8, mode="L") return img def _assert_grouped_gemm_output(self, out, group_sizes, ref, dtype): assert out.dtype == ref.dtype print(f"Group sizes [{jnp.sum(group_sizes)}]: {group_sizes}") - self._tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save('output_te.png') - self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save('output_ref.png') + self._tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save("output_te.png") + self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save("output_ref.png") self._tensor_to_image( jnp.abs(out.astype(jnp.float32) - ref.astype(jnp.float32)), - value_range=(jnp.min(ref), jnp.max(ref)) + value_range=(jnp.min(ref), jnp.max(ref)), # value_range=(0, 0.5) - ).save('output_diff.png') + ).save("output_diff.png") assert_allclose(out, ref, dtype=jnp.float32) # ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) @@ -1906,8 +1906,8 @@ def _assert_grouped_gemm_output(self, out, group_sizes, ref, dtype): # for i in range(len(ref_list)): # print(f"Asserting output for group {i}, output shape: {out_list[i].shape}, ref shape: {ref_list[i].shape}") # assert_allclose( - # out_list[i], - # ref_list[i], + # out_list[i], + # ref_list[i], # dtype=dtype, #jnp.float8_e4m3fn # HACK: TE impl is close but not precise enough for 16-bit # ) @@ -1978,7 +1978,7 @@ def _ref_sum_grouped_dense(self, x, kernel, bias, group_sizes, contracting_dims) # Note: we use jnp.sum instead of jnp.mean to make the gradient larger # and prevent them from being clamp to zero in FP8. / sqrt(x.size) is used to # normalize the output and prevent the gradient from being too large for FP8. - out_sum_list = jnp.sum(out_list) # [jnp.sum(out) for out in out_list] + out_sum_list = jnp.sum(out_list) # [jnp.sum(out) for out in out_list] return jnp.sum(jnp.asarray(out_sum_list)) / jnp.sqrt(x.size) def _primitive_sum_grouped_dense( @@ -2020,9 +2020,16 @@ def test_grouped_dense_grad_fp16(self, dtype, input_shape): print("Hi") def write_images(prim, ref): - self._tensor_to_image(prim, value_range=(jnp.min(ref), jnp.max(ref))).save('output_te.png') - self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save('output_ref.png') - self._tensor_to_image(jnp.abs(prim.astype(jnp.float32) - ref.astype(jnp.float32)), value_range=(jnp.min(ref), jnp.max(ref))).save('output_diff.png') + self._tensor_to_image(prim, value_range=(jnp.min(ref), jnp.max(ref))).save( + "output_te.png" + ) + self._tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save( + "output_ref.png" + ) + self._tensor_to_image( + jnp.abs(prim.astype(jnp.float32) - ref.astype(jnp.float32)), + value_range=(jnp.min(ref), jnp.max(ref)), + ).save("output_diff.png") assert_allclose(prim_out_sum, ref_out_sum, dtype=dtype) assert_allclose(prim_dgrad, ref_dgrad, atol=0.015, rtol=0.75) @@ -2077,18 +2084,22 @@ def test_grouped_dense_grad_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape): assert_allclose(prim_wgrad, ref_wgrad, dtype=bwd_dtype) # assert_allclose(prim_dbias, ref_dbias, dtype=dtype) -@pytest_parametrize_wrapper('eqn,a_shape,b_shape', [ - # ('ij,jk->ik', (64, 32), (32, 128)), - # ('bij,bjk->bik', (8, 64, 32), (8, 32, 128)), - # ('abc,cde->abde', (4, 8, 16), (16, 32, 64)), - ('BSM,BSEC->EBCM', (2, 16, 16), (2, 16, 8, 8)), - ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)) , - ('EBCM,EMH->EBCH', (8, 2, 1024, 4096), (8, 4096, 14336)), - ('EBCH,EHM->EBCM', (8, 2, 1024, 14336), (8, 14336, 4096)), - ('EBCM,BSEC->BSM', (8, 2, 1024, 4096), (2, 4096, 8, 1024)), -]) -@pytest_parametrize_wrapper('dtype', [jnp.bfloat16]) -@pytest_parametrize_wrapper('quantization_recipe', supported_recipes) + +@pytest_parametrize_wrapper( + "eqn,a_shape,b_shape", + [ + # ('ij,jk->ik', (64, 32), (32, 128)), + # ('bij,bjk->bik', (8, 64, 32), (8, 32, 128)), + # ('abc,cde->abde', (4, 8, 16), (16, 32, 64)), + ("BSM,BSEC->EBCM", (2, 16, 16), (2, 16, 8, 8)), + ("EBCM,EMH->EBCH", (8, 2, 1024, 4096), (8, 4096, 14336)), + ("EBCM,EMH->EBCH", (8, 2, 1024, 4096), (8, 4096, 14336)), + ("EBCH,EHM->EBCM", (8, 2, 1024, 14336), (8, 14336, 4096)), + ("EBCM,BSEC->BSM", (8, 2, 1024, 4096), (2, 4096, 8, 1024)), + ], +) +@pytest_parametrize_wrapper("dtype", [jnp.bfloat16]) +@pytest_parametrize_wrapper("quantization_recipe", supported_recipes) class TestEinsum: def _te_einsum(self, eqn, a, b, quantization_recipe): @@ -2113,7 +2124,9 @@ def test_einsum_fwd(self, eqn, a_shape, b_shape, dtype, quantization_recipe): a = jax.random.uniform(subkeys[0], a_shape, dtype=dtype) b = jax.random.uniform(subkeys[1], b_shape, dtype=dtype) - te_out = jax.jit(functools.partial(self._te_einsum, eqn, quantization_recipe=quantization_recipe))(a, b) + te_out = jax.jit( + functools.partial(self._te_einsum, eqn, quantization_recipe=quantization_recipe) + )(a, b) ref_out = jax.jit(functools.partial(self._ref_einsum, eqn))(a, b) # jax.config.update("jax_numpy_rank_promotion", "raise") @@ -2137,14 +2150,25 @@ def wrap_in_mean(f): @functools.wraps(f) def wrapped(*args): return jnp.mean(f(*args)) + return wrapped - te_fwd, te_grads = jax.jit(jax.value_and_grad(wrap_in_mean(functools.partial(self._te_einsum, eqn, quantization_recipe=quantization_recipe))))(a, b) - ref_fwd, ref_grads = jax.jit(jax.value_and_grad(wrap_in_mean(functools.partial(self._ref_einsum, eqn))))(a, b) + te_fwd, te_grads = jax.jit( + jax.value_and_grad( + wrap_in_mean( + functools.partial(self._te_einsum, eqn, quantization_recipe=quantization_recipe) + ) + ) + )(a, b) + ref_fwd, ref_grads = jax.jit( + jax.value_and_grad(wrap_in_mean(functools.partial(self._ref_einsum, eqn))) + )(a, b) assert_allclose(te_fwd, ref_fwd, dtype=dtype) - assert len(te_grads) == len(ref_grads), f"Number of gradients differ: {len(te_grads)=} vs {len(ref_grads)=}" + assert len(te_grads) == len( + ref_grads + ), f"Number of gradients differ: {len(te_grads)=} vs {len(ref_grads)=}" for te_grad, ref_grad in zip(te_grads, ref_grads): - assert_allclose(te_grad, ref_grad, dtype=dtype) \ No newline at end of file + assert_allclose(te_grad, ref_grad, dtype=dtype) diff --git a/transformer_engine/common/cast/cast.cu b/transformer_engine/common/cast/cast.cu index e8a4b9fcba..7149136d74 100644 --- a/transformer_engine/common/cast/cast.cu +++ b/transformer_engine/common/cast/cast.cu @@ -76,8 +76,8 @@ void nvte_multi_tensor_quantize(const NVTETensor *inputs, NVTETensor *outputs, constexpr bool IS_ACT = false; for (int i = 0; i < num_tensors; i++) { - dispatch::quantize_fwd_helper( - inputs[i], outputs[i], quant_configs, stream); + dispatch::quantize_fwd_helper(inputs[i], outputs[i], quant_configs, + stream); } } diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 11543be501..241e30764a 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -155,7 +155,7 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla if (is_fp8_dtype(ret.Atype)) { // Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage // NVTE_CHECK(ret.lda % 16 == 0, - // "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); + // "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); } } else if (nvfp4) { // NVFP4 GEMM. Either the pure NVFP4 recipe or the FWD pass of the Hybrid NVFP4/MXFP8 recipe. diff --git a/transformer_engine/jax/__init__.py b/transformer_engine/jax/__init__.py index ef249db94c..805b46fc39 100644 --- a/transformer_engine/jax/__init__.py +++ b/transformer_engine/jax/__init__.py @@ -30,6 +30,7 @@ from transformer_engine.common import load_framework_extension from . import rtc + print("Compiling JAX RTC extension...") rtc.compile_extension() load_framework_extension("jax") diff --git a/transformer_engine/jax/cpp_extensions/base.py b/transformer_engine/jax/cpp_extensions/base.py index 1e672ca451..c940c30ef1 100644 --- a/transformer_engine/jax/cpp_extensions/base.py +++ b/transformer_engine/jax/cpp_extensions/base.py @@ -216,7 +216,8 @@ def batcher(batched_args, batch_dims, *, arg1, arg2, arg3): elif arg.shape[bdim] != batch_size: raise ValueError( "All batched arguments must have the same batch size. " - f"Got sizes {[arg.shape[bdim] for arg, bdim in zip(batched_args, batch_dims) if bdim is not None]}. " + "Got sizes" + f" {[arg.shape[bdim] for arg, bdim in zip(batched_args, batch_dims) if bdim is not None]}. " f"Got batched_args={[arg.shape for arg, bdim in zip(batched_args, batch_dims) if bdim is not None]}." ) assert batch_dim is not None and batch_size is not None, "Invalid batching config!" @@ -255,7 +256,8 @@ def batcher(batched_args, batch_dims, *, arg1, arg2, arg3): # Stack each output along the batch dimension if output_bdims is not None: stacked_results = tuple( - jnp.stack(list(out_list), axis=out_bdim) for out_list, out_bdim in zip(transposed, output_bdims) + jnp.stack(list(out_list), axis=out_bdim) + for out_list, out_bdim in zip(transposed, output_bdims) ) else: stacked_results = tuple( diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 3c5807a921..83b3e7e700 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -818,8 +818,9 @@ def batcher( # f"got lhs_bdims={lhs_bdims}, rhs_bdims={rhs_bdims}" # ) - f = partial(GemmPrimitive.outer_impl, - **{ + f = partial( + GemmPrimitive.outer_impl, + **{ "out_dtype": out_dtype, "contracting_dims": contracting_dims, "scaling_mode": scaling_mode, @@ -831,16 +832,16 @@ def batcher( "transpose_batch_sequence": transpose_batch_sequence, "sequence_dim": sequence_dim, "is_outer": is_outer, - }) - + }, + ) + lhs_cdims, rhs_cdims = contracting_dims # Calculate output batch dimension based on input batch dims and contracting dims # Both lhs and rhs have batch dimensions that may be at different indices if lhs_bdims is not None and rhs_bdims is not None: # Count non-contracting dimensions in LHS before the batch dimension lhs_non_contracting_before_batch = sum( - 1 for i in range(lhs_bdims) - if i not in lhs_cdims + 1 for i in range(lhs_bdims) if i not in lhs_cdims ) # The output batch dimension will be at the position corresponding to # the LHS batch dimension's position among non-contracting dimensions @@ -850,8 +851,13 @@ def batcher( output_bdim = 0 elif rhs_bdims is not None: # RHS has a batch dimension - need to account for LHS non-contracting dims - lhs_non_contracting = len([i for i in range(len(batched_args[0].shape)) - if i not in lhs_cdims and i != lhs_bdims]) + lhs_non_contracting = len( + [ + i + for i in range(len(batched_args[0].shape)) + if i not in lhs_cdims and i != lhs_bdims + ] + ) output_bdim = lhs_non_contracting else: # No batch dimensions in either operand @@ -861,16 +867,16 @@ def batcher( return GemmPrimitive.batcher_impl( batched_args, batch_dims=( - lhs_bdims, # lhs - 0, # lhs_scale_inv - rhs_bdims, # rhs - 0, # rhs_scale_inv - *(None for _ in batched_args[4:]), # bias, gelu_input, alpha, beta + lhs_bdims, # lhs + 0, # lhs_scale_inv + rhs_bdims, # rhs + 0, # rhs_scale_inv + *(None for _ in batched_args[4:]), # bias, gelu_input, alpha, beta ), output_bdims=( - output_bdim, # output - 0, # bias_grad - 0, # pre_gelu_out + output_bdim, # output + 0, # bias_grad + 0, # pre_gelu_out ), static_kwargs={ "out_dtype": out_dtype, @@ -1539,7 +1545,9 @@ def abstract( workspace_size += lhs_scale_inv_aval.size + mxfp8_scaling_sinv_alignment_padding workspace_size += rhs_scale_inv_aval.size + mxfp8_scaling_sinv_alignment_padding - workspace_size += 1024*1024 # HACK: properly make a workspace_setup buffer in addition to the workspace_cublas buffer + workspace_size += ( + 1024 * 1024 + ) # HACK: properly make a workspace_setup buffer in addition to the workspace_cublas buffer workspace_aval = jax.core.ShapedArray(shape=(workspace_size,), dtype=jnp.uint8) out_shape = (M, N) @@ -2125,17 +2133,26 @@ def grouped_gemm( assert not has_bias or bias.shape == (group_sizes.size, N) bias = jnp.empty((), jnp.float32) if bias is None else bias - group_sizes = group_sizes.astype(jnp.int64) # Compute group_offset as cumulative sum of group_sizes, starting with 0 - group_offset = jnp.concatenate([jnp.array([0], dtype=jnp.int64), jnp.cumsum(group_sizes, dtype=jnp.int64)[:-1]]) + group_offset = jnp.concatenate( + [jnp.array([0], dtype=jnp.int64), jnp.cumsum(group_sizes, dtype=jnp.int64)[:-1]] + ) if is_grouped_dense_wgrad: - group_offset_lhs = group_offset * M # Offset is by number of elements total, not number of rows + group_offset_lhs = ( + group_offset * M + ) # Offset is by number of elements total, not number of rows # HACK: this _out is really the rhs in this case - group_offset_out = group_offset * 1 # Offset is by number of elements total, not number of rows - else: - group_offset_lhs = group_offset * K_lhs # Offset is by number of elements total, not number of rows - group_offset_out = group_offset * N # Offset is by number of elements total, not number of rows + group_offset_out = ( + group_offset * 1 + ) # Offset is by number of elements total, not number of rows + else: + group_offset_lhs = ( + group_offset * K_lhs + ) # Offset is by number of elements total, not number of rows + group_offset_out = ( + group_offset * N + ) # Offset is by number of elements total, not number of rows # jax.debug.print("group_sizes: {}, group_offset: {}", group_sizes, group_offset) # jax.debug.print("M={}, jnp.sum(group_sizes)={}, N={}, K_lhs={}", M, jnp.sum(group_sizes), N, K_lhs) @@ -2170,10 +2187,12 @@ def grouped_gemm( use_async_d2h_group_sizes=use_async_d2h_group_sizes, ) if not is_grouped_dense_wgrad: + def my_callback(lhs, rhs, group_sizes, out): if contracting_dims != ((1,), (2,)): return import numpy as np + lhs = np.array(lhs.astype(jnp.float32)) rhs = np.array(rhs.astype(jnp.float32)) group_sizes = np.array(group_sizes, dtype=group_sizes.dtype) @@ -2186,14 +2205,14 @@ def my_callback(lhs, rhs, group_sizes, out): if inputs_are_nan or not out_is_nan: return print("GroupedGemm NAN detected! cdims:", contracting_dims) - np.save('gemm_lhs.npy', lhs) - np.save('gemm_rhs.npy', rhs) - np.save('gemm_group_sizes.npy', group_sizes) + np.save("gemm_lhs.npy", lhs) + np.save("gemm_rhs.npy", rhs) + np.save("gemm_group_sizes.npy", group_sizes) return # jax.debug.callback(my_callback, - # lhs, rhs, group_sizes, out, - # ordered=True, partitioned=True) + # lhs, rhs, group_sizes, out, + # ordered=True, partitioned=True) # jax.debug.print("group_sizes: {}, lhs=[amax={}, mean={}, stddev={}], rhs=[amax={}, mean={}, stddev={}], out=[amax={}, mean={}, stddev={}]", # group_sizes, diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py index 535e39d60e..a4c3655a54 100644 --- a/transformer_engine/jax/cpp_extensions/quantization.py +++ b/transformer_engine/jax/cpp_extensions/quantization.py @@ -96,7 +96,9 @@ def abstract( dtype = dtypes.canonicalize_dtype(x_aval.dtype) assert dtype in [jnp.float32, jnp.float16, jnp.bfloat16] out_shape = x_aval.shape - assert scale_aval is None or scale_aval.dtype == jnp.float32, f"scale must be float32 but received {scale_aval}" + assert ( + scale_aval is None or scale_aval.dtype == jnp.float32 + ), f"scale must be float32 but received {scale_aval}" if stochastic_rounding: assert ScalingMode( scaling_mode @@ -368,10 +370,12 @@ def batcher( batch_dims, output_bdims=( batch_dims[0], # out - batch_dims[0], # colwise_out (probably need to transpose according if scaling mode does it) + batch_dims[ + 0 + ], # colwise_out (probably need to transpose according if scaling mode does it) 0, # scale_inv 0, # colwise_scale_inv - 0, # updated_amax + 0, # updated_amax 0, # dbias ), static_kwargs={ diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 83dea142e0..3048409ea4 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -410,13 +410,13 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmD2HGroupSizesHandler, GroupedGemmD2HGro .Attr("num_gemms")); class JAXX_GroupedTensorWrapper { -public: + public: JAXX_GroupedTensorWrapper() = delete; JAXX_GroupedTensorWrapper(JAXX_Scaling_Mode scaling_mode, size_t num_tensors, - NVTEShape const& dataShape); - JAXX_GroupedTensorWrapper(JAXX_GroupedTensorWrapper const&) = delete; - JAXX_GroupedTensorWrapper& operator=(JAXX_GroupedTensorWrapper const&) = delete; - JAXX_GroupedTensorWrapper(JAXX_GroupedTensorWrapper&& other) noexcept + NVTEShape const &dataShape); + JAXX_GroupedTensorWrapper(JAXX_GroupedTensorWrapper const &) = delete; + JAXX_GroupedTensorWrapper &operator=(JAXX_GroupedTensorWrapper const &) = delete; + JAXX_GroupedTensorWrapper(JAXX_GroupedTensorWrapper &&other) noexcept : m_data_shape(other.m_data_shape), m_grouped_tensor(other.m_grouped_tensor), m_data_tensor(other.m_data_tensor), @@ -425,17 +425,17 @@ class JAXX_GroupedTensorWrapper { m_offsets_tensor(other.m_offsets_tensor) { other.m_grouped_tensor = nullptr; } - JAXX_GroupedTensorWrapper& operator=(JAXX_GroupedTensorWrapper&&) = delete; + JAXX_GroupedTensorWrapper &operator=(JAXX_GroupedTensorWrapper &&) = delete; ~JAXX_GroupedTensorWrapper(); - void set_rowwise(Buffer_Type const& data, std::optional const& scale_inv); - void set_group_info(Buffer_Type const& group_sizes, Buffer_Type const& group_offsets, + void set_rowwise(Buffer_Type const &data, std::optional const &scale_inv); + void set_group_info(Buffer_Type const &group_sizes, Buffer_Type const &group_offsets, NVTEGroupedTensorParam group_sizes_param_name); operator NVTEGroupedTensor() const { return m_grouped_tensor; } - NVTEGroupedTensor const& get_grouped_tensor() const; + NVTEGroupedTensor const &get_grouped_tensor() const; -private: + private: NVTEShape m_data_shape{}; NVTEGroupedTensor m_grouped_tensor{}; @@ -449,9 +449,10 @@ class JAXX_GroupedTensorWrapper { JAXX_GroupedTensorWrapper::JAXX_GroupedTensorWrapper(JAXX_Scaling_Mode scaling_mode, size_t num_tensors, - NVTEShape const& dataShape) { + NVTEShape const &dataShape) { m_data_shape = dataShape; - m_grouped_tensor = nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, dataShape); + m_grouped_tensor = + nvte_create_grouped_tensor(get_nvte_scaling_mode(scaling_mode), num_tensors, dataShape); } JAXX_GroupedTensorWrapper::~JAXX_GroupedTensorWrapper() { @@ -460,8 +461,8 @@ JAXX_GroupedTensorWrapper::~JAXX_GroupedTensorWrapper() { } } -void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const& data, - std::optional const& scale_inv) { +void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const &data, + std::optional const &scale_inv) { // printf("set_rowwise data shape: XLA buffer shape: "); // for (auto dim : data.dimensions()) { // printf("%zu, ", dim); @@ -471,9 +472,10 @@ void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const& data, // printf("%d, ", m_data_shape.data[i]); // } // printf("\n"); - NVTEDType data_dtype = static_cast(convert_ffi_datatype_to_te_dtype(data.element_type())); - m_data_tensor = NVTEBasicTensor{reinterpret_cast(data.untyped_data()), data_dtype, - m_data_shape}; + NVTEDType data_dtype = + static_cast(convert_ffi_datatype_to_te_dtype(data.element_type())); + m_data_tensor = + NVTEBasicTensor{reinterpret_cast(data.untyped_data()), data_dtype, m_data_shape}; nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedRowwiseData, &m_data_tensor); @@ -493,24 +495,22 @@ void JAXX_GroupedTensorWrapper::set_rowwise(Buffer_Type const& data, scale_inv->dimensions().size()); } m_scale_inv_tensor = NVTEBasicTensor{reinterpret_cast(scale_inv->untyped_data()), - scale_inv_dtype, logical_scale_shape}; + scale_inv_dtype, logical_scale_shape}; nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedRowwiseScaleInv, &m_scale_inv_tensor); } } -void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, - Buffer_Type const& group_offsets, +void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const &group_sizes, + Buffer_Type const &group_offsets, NVTEGroupedTensorParam group_sizes_param_name) { NVTEDType sizes_dtype = static_cast(convert_ffi_datatype_to_te_dtype(group_sizes.element_type())); NVTEDType offsets_dtype = static_cast(convert_ffi_datatype_to_te_dtype(group_offsets.element_type())); - NVTE_CHECK(sizes_dtype == NVTEDType::kNVTEInt64, - "group_sizes must be of type int64."); - NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt64, - "group_offsets must be of type int64."); + NVTE_CHECK(sizes_dtype == NVTEDType::kNVTEInt64, "group_sizes must be of type int64."); + NVTE_CHECK(offsets_dtype == NVTEDType::kNVTEInt64, "group_offsets must be of type int64."); size_t num_tensors = group_sizes.dimensions()[0]; NVTE_CHECK(group_sizes.dimensions().size() == 1, @@ -525,21 +525,22 @@ void JAXX_GroupedTensorWrapper::set_group_info(Buffer_Type const& group_sizes, shape.data[0] = num_tensors; m_sizes_tensor = NVTEBasicTensor{reinterpret_cast(group_sizes.untyped_data()), - NVTEDType::kNVTEInt64, - shape}; + NVTEDType::kNVTEInt64, shape}; m_offsets_tensor = NVTEBasicTensor{reinterpret_cast(group_offsets.untyped_data()), - NVTEDType::kNVTEInt64, - shape}; + NVTEDType::kNVTEInt64, shape}; nvte_set_grouped_tensor_param(&m_grouped_tensor, group_sizes_param_name, &m_sizes_tensor); nvte_set_grouped_tensor_param(&m_grouped_tensor, kNVTEGroupedTensorOffsets, &m_offsets_tensor); } -NVTEGroupedTensor const& JAXX_GroupedTensorWrapper::get_grouped_tensor() const { +NVTEGroupedTensor const &JAXX_GroupedTensorWrapper::get_grouped_tensor() const { return m_grouped_tensor; } -JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const& data, std::optional scale_inv, JAXX_Scaling_Mode scaling_mode, size_t num_tensors, NVTEShape const& dataShape) { +JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const &data, + std::optional scale_inv, + JAXX_Scaling_Mode scaling_mode, size_t num_tensors, + NVTEShape const &dataShape) { JAXX_GroupedTensorWrapper grouped_tensor_wrapper(scaling_mode, num_tensors, dataShape); if (scaling_mode == JAXX_Scaling_Mode::NO_SCALING) { scale_inv = std::nullopt; @@ -551,12 +552,12 @@ JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const& data, std::opti Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv, Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias, - Buffer_Type group_sizes, Buffer_Type group_offset_lhs, Buffer_Type group_offset_out, - Buffer_Type alpha, Buffer_Type beta, - Result_Type output, Result_Type workspace, - size_t m, size_t n, size_t k, bool lhs_is_trans, - bool rhs_is_trans, JAXX_Scaling_Mode scaling_mode, bool has_bias, - bool is_grouped_dense_wgrad, bool use_async_d2h_group_sizes) { + Buffer_Type group_sizes, Buffer_Type group_offset_lhs, + Buffer_Type group_offset_out, Buffer_Type alpha, Buffer_Type beta, + Result_Type output, Result_Type workspace, size_t m, size_t n, size_t k, + bool lhs_is_trans, bool rhs_is_trans, JAXX_Scaling_Mode scaling_mode, + bool has_bias, bool is_grouped_dense_wgrad, + bool use_async_d2h_group_sizes) { // Notes on matrix layouts and transpose: // Jax uses row-major data_layout, on entering this function, each input matrix pair: // A: row-major [m, k] for N - [k, m] for T @@ -672,52 +673,55 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type "got lhs_is_trans=", lhs_is_trans, ", rhs_is_trans=", rhs_is_trans); } - constexpr size_t workspace_setup_size = 1024 * 1024; // HACK: dummy workspace for setup - TensorWrapper workspace_setup(workspace_ptr, - std::vector{workspace_setup_size}, DType::kByte); + constexpr size_t workspace_setup_size = 1024 * 1024; // HACK: dummy workspace for setup + TensorWrapper workspace_setup(workspace_ptr, std::vector{workspace_setup_size}, + DType::kByte); TensorWrapper workspace_cublas(workspace_ptr + workspace_setup_size, - std::vector{workspace_size}, DType::kByte); - - TensorWrapper alpha_tensor(static_cast(alpha.untyped_data()), std::vector{num_gemms}, - convert_ffi_datatype_to_te_dtype(alpha.element_type())); - TensorWrapper beta_tensor(static_cast(beta.untyped_data()), std::vector{num_gemms}, - convert_ffi_datatype_to_te_dtype(beta.element_type())); + std::vector{workspace_size}, DType::kByte); + TensorWrapper alpha_tensor(static_cast(alpha.untyped_data()), + std::vector{num_gemms}, + convert_ffi_datatype_to_te_dtype(alpha.element_type())); + TensorWrapper beta_tensor(static_cast(beta.untyped_data()), + std::vector{num_gemms}, + convert_ffi_datatype_to_te_dtype(beta.element_type())); // printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0], lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); if (is_grouped_dense_wgrad) { - // printf("GroupedGemmFFI: (lhs_is_trans=%d, rhs_is_trans=%d) m=%zu, k=%zu, n=%zu, rhs_shape=[", lhs_is_trans, rhs_is_trans, m, k, n); - // for (auto dim : rhs_data.dimensions()) { - // printf("%zu, ", dim); - // } - // printf("], lhs_shape=["); - // for (auto dim : lhs_data.dimensions()) { - // printf("%zu, ", dim); - // } - // printf("], out_shape=["); - // for (auto dim : output->dimensions()) { - // printf("%zu, ", dim); - // } - // printf("]\n"); - - NVTE_CHECK(lhs_is_trans && !rhs_is_trans, "For grouped dense wgrad, only TN GEMM is supported in TE/JAX currently."); + // printf("GroupedGemmFFI: (lhs_is_trans=%d, rhs_is_trans=%d) m=%zu, k=%zu, n=%zu, rhs_shape=[", lhs_is_trans, rhs_is_trans, m, k, n); + // for (auto dim : rhs_data.dimensions()) { + // printf("%zu, ", dim); + // } + // printf("], lhs_shape=["); + // for (auto dim : lhs_data.dimensions()) { + // printf("%zu, ", dim); + // } + // printf("], out_shape=["); + // for (auto dim : output->dimensions()) { + // printf("%zu, ", dim); + // } + // printf("]\n"); + + NVTE_CHECK(lhs_is_trans && !rhs_is_trans, + "For grouped dense wgrad, only TN GEMM is supported in TE/JAX currently."); //// RHS - NVTEShape rhsShape{.data={k, n}, .ndim=2}; + NVTEShape rhsShape{.data = {k, n}, .ndim = 2}; // rhs_is_trans = true; auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); rhs_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); //// LHS - NVTEShape lhsShape{.data={m, k}, .ndim=2}; + NVTEShape lhsShape{.data = {m, k}, .ndim = 2}; lhs_is_trans = true; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedLastDims); //// OUTPUT - NVTEShape outShape{.data={num_gemms*m, n}, .ndim=2}; - auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); + NVTEShape outShape{.data = {num_gemms * m, n}, .ndim = 2}; + auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, + num_gemms, outShape); // printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", // rhsShape.data[0], rhsShape.data[1], @@ -730,17 +734,11 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // TODO(jberchtold): make this memset smaller by only zeroing the expert weights that correspond to groups with size zero. cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); - nvte_grouped_gemm( - lhs_tensor, lhs_is_trans, - rhs_tensor, rhs_is_trans, - nullptr, - out_tensor, - alpha_tensor.data(), - beta_tensor.data(), - workspace_setup.data(), - workspace_cublas.data(), - nullptr, // config (use defaults) - stream); + nvte_grouped_gemm(lhs_tensor, lhs_is_trans, rhs_tensor, rhs_is_trans, nullptr, out_tensor, + alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), + workspace_cublas.data(), + nullptr, // config (use defaults) + stream); cudaStreamSynchronize(stream); @@ -750,10 +748,9 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // Nominal case for FWD or DGRAD //// RHS - NVTEShape rhsShape{.data={num_gemms * k, n}, .ndim=2}; + NVTEShape rhsShape{.data = {num_gemms * k, n}, .ndim = 2}; // rhs_is_trans = true; - if (rhs_is_trans) { rhsShape.data[0] = num_gemms * n; rhsShape.data[1] = k; @@ -763,7 +760,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); //// LHS - NVTEShape lhsShape{.data={m, k}, .ndim=2}; + NVTEShape lhsShape{.data = {m, k}, .ndim = 2}; // NVTE_CHECK(lhs_is_trans, "GroupedGemmFFI currently only supports lhs_is_trans=true"); // lhs_is_trans = true; if (!lhs_is_trans) { @@ -775,11 +772,13 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // return ffi_with_cuda_error_check(); // } auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - lhs_tensor.set_group_info(group_sizes, group_offset_lhs, lhs_is_trans ? kNVTEGroupedFirstDims : kNVTEGroupedLastDims); + lhs_tensor.set_group_info(group_sizes, group_offset_lhs, + lhs_is_trans ? kNVTEGroupedFirstDims : kNVTEGroupedLastDims); //// OUTPUT - NVTEShape outShape{.data={m, n}, .ndim=2}; - auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); + NVTEShape outShape{.data = {m, n}, .ndim = 2}; + auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, + num_gemms, outShape); out_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); // printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", @@ -793,35 +792,31 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); std::vector host_group_sizes(num_gemms); - cudaMemcpyAsync(host_group_sizes.data(), group_sizes.untyped_data(), - num_gemms * sizeof(int32_t), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(host_group_sizes.data(), group_sizes.untyped_data(), num_gemms * sizeof(int32_t), + cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); int currentDevice; cudaGetDevice(¤tDevice); - printf("[gpu=%d] Group sizes[total_group_size=%zu, m=%zu]: ", currentDevice, std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL), m); + printf("[gpu=%d] Group sizes[total_group_size=%zu, m=%zu]: ", currentDevice, + std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL), m); for (size_t i = 0; i < num_gemms; ++i) { printf("%d, ", host_group_sizes[i]); } printf("\n"); - nvte_grouped_gemm( - lhs_tensor, lhs_is_trans, - rhs_tensor, rhs_is_trans, - nullptr, - out_tensor, - alpha_tensor.data(), - beta_tensor.data(), - workspace_setup.data(), - workspace_cublas.data(), - nullptr, // config (use defaults) - stream); - size_t _offset = std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL) * n * out_dtype_bytes; + nvte_grouped_gemm(lhs_tensor, lhs_is_trans, rhs_tensor, rhs_is_trans, nullptr, out_tensor, + alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), + workspace_cublas.data(), + nullptr, // config (use defaults) + stream); + size_t _offset = + std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL) * n * out_dtype_bytes; cudaMemsetAsync(output->untyped_data() + _offset, 0, output->size_bytes() - _offset, stream); - std::vector<__bf16> debug_output(m*n); - cudaMemcpyAsync(debug_output.data(), output->untyped_data(), - m * n * out_dtype_bytes, cudaMemcpyDeviceToHost, stream); + std::vector<__bf16> debug_output(m * n); + cudaMemcpyAsync(debug_output.data(), output->untyped_data(), m * n * out_dtype_bytes, + cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); size_t totalPrints = 0; @@ -829,8 +824,10 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type for (size_t i_m = 0; i_m < m; i_m++) { for (size_t i_n = 0; i_n < n; i_n++) { size_t index = i_m * n + i_n; - if (isnan(static_cast(debug_output[index])) || isinf(static_cast(debug_output[index]))) { - printf("[gpu=%d] Output contains NaN or Inf at index [%zu, %zu] (flat index %zu)\n", i_m, i_n, index); + if (isnan(static_cast(debug_output[index])) || + isinf(static_cast(debug_output[index]))) { + printf("[gpu=%d] Output contains NaN or Inf at index [%zu, %zu] (flat index %zu)\n", i_m, + i_n, index); totalPrints++; if (totalPrints >= MAX_PRINTS) { break; diff --git a/transformer_engine/jax/csrc/extensions/quantization.cpp b/transformer_engine/jax/csrc/extensions/quantization.cpp index da2dd7e725..73c04b12e8 100644 --- a/transformer_engine/jax/csrc/extensions/quantization.cpp +++ b/transformer_engine/jax/csrc/extensions/quantization.cpp @@ -400,12 +400,11 @@ Error_Type GroupedQuantizeFFI(cudaStream_t stream, Buffer_Type inputs, Buffer_Ty size_t total_rowwise_sinv_size = 0; size_t total_colwise_sinv_size = 0; - // TODO(jberchtold): This is a temporary fix to zero out the output buffers to prevent NaNs in output when this buffer is over-allocated and the groups do not fill the whole buffer. Though these NaNs should be ignored in the downstream GEMM, so more debugging is needed to see why they cause issues. - size_t used_output_size = (sum_group_sizes*non_group_m) * n * output_dtype_bytes; - cudaMemsetAsync(outputs->untyped_data() + used_output_size, 0, outputs->size_bytes() - used_output_size, stream); + size_t used_output_size = (sum_group_sizes * non_group_m) * n * output_dtype_bytes; + cudaMemsetAsync(outputs->untyped_data() + used_output_size, 0, + outputs->size_bytes() - used_output_size, stream); - for (size_t i = 0; i < num_groups; i++) { size_t m_i = dim_list_host[i] * non_group_m; // Skip for zero-size input + shiff the scale ptr diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index ed1e5dfc38..b438a435ae 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -237,46 +237,46 @@ def _dense_fwd_rule( ) return output, ctx -def dot_general_transpose_lhs(g, x, y, *, dimension_numbers, - swap_ans=False): - # from: https://github.com/google/flax/blob/main/flax/linen/fp8_ops.py#L198 - import itertools - import numpy as np - def _remaining(original, *removed_lists): - removed = set(itertools.chain(*removed_lists)) - return tuple(i for i in original if i not in removed) - - def _ranges_like(*xs): - start = 0 - for x in xs: - x_len = len(x) - yield tuple(range(start, start + x_len)) - start += x_len - - (x_contract, y_contract), (x_batch, y_batch) = dimension_numbers - x_ndim = x.ndim - x_kept = _remaining(tuple(range(x_ndim)), x_contract, x_batch) - y_kept = _remaining(tuple(range(y.ndim)), y_contract, y_batch) - if swap_ans: - ans_batch, ans_y, _ = _ranges_like(x_batch, y_kept, x_kept) - else: - ans_batch, _, ans_y = _ranges_like(x_batch, x_kept, y_kept) - dims = ((ans_y, y_kept), (ans_batch, y_batch)) - x_contract_sorted_by_y = tuple(np.take(x_contract, np.argsort(y_contract))) - out_axes = np.argsort(tuple(x_batch) + x_kept + x_contract_sorted_by_y) - x_bar = jax.lax.transpose( - tex.gemm(g, y, contracting_dims=dims[0]), - tuple(out_axes) - ) - return x_bar + +def dot_general_transpose_lhs(g, x, y, *, dimension_numbers, swap_ans=False): + # from: https://github.com/google/flax/blob/main/flax/linen/fp8_ops.py#L198 + import itertools + import numpy as np + + def _remaining(original, *removed_lists): + removed = set(itertools.chain(*removed_lists)) + return tuple(i for i in original if i not in removed) + + def _ranges_like(*xs): + start = 0 + for x in xs: + x_len = len(x) + yield tuple(range(start, start + x_len)) + start += x_len + + (x_contract, y_contract), (x_batch, y_batch) = dimension_numbers + x_ndim = x.ndim + x_kept = _remaining(tuple(range(x_ndim)), x_contract, x_batch) + y_kept = _remaining(tuple(range(y.ndim)), y_contract, y_batch) + if swap_ans: + ans_batch, ans_y, _ = _ranges_like(x_batch, y_kept, x_kept) + else: + ans_batch, _, ans_y = _ranges_like(x_batch, x_kept, y_kept) + dims = ((ans_y, y_kept), (ans_batch, y_batch)) + x_contract_sorted_by_y = tuple(np.take(x_contract, np.argsort(y_contract))) + out_axes = np.argsort(tuple(x_batch) + x_kept + x_contract_sorted_by_y) + x_bar = jax.lax.transpose(tex.gemm(g, y, contracting_dims=dims[0]), tuple(out_axes)) + return x_bar + def dot_general_transpose_rhs(g, x, y, *, dimension_numbers): - (x_contract, y_contract), (x_batch, y_batch) = dimension_numbers - swapped_dimension_numbers = ((y_contract, x_contract), (y_batch, x_batch)) - y_bar = dot_general_transpose_lhs( - g, y, x, dimension_numbers=swapped_dimension_numbers, - swap_ans=True) - return y_bar + (x_contract, y_contract), (x_batch, y_batch) = dimension_numbers + swapped_dimension_numbers = ((y_contract, x_contract), (y_batch, x_batch)) + y_bar = dot_general_transpose_lhs( + g, y, x, dimension_numbers=swapped_dimension_numbers, swap_ans=True + ) + return y_bar + def _dense_bwd_rule( contracting_dims, @@ -318,7 +318,7 @@ def _dense_bwd_rule( ) fwd_cdims = (fwd_x_contracting_dims, fwd_k_contracting_dims) - batch_dims = ((), ()) # vmap is done outside dense VJP if needed + batch_dims = ((), ()) # vmap is done outside dense VJP if needed dims = (fwd_cdims, batch_dims) dgrad = dot_general_transpose_lhs( @@ -329,7 +329,9 @@ def _dense_bwd_rule( ) wgrad = dot_general_transpose_rhs( - casted_grad.get_tensor(usage=TensorUsage.LHS), # TODO(jberchtold): should be RHS to use fused kernel for 2x layout? but would need to update dims accordingly + casted_grad.get_tensor( + usage=TensorUsage.LHS + ), # TODO(jberchtold): should be RHS to use fused kernel for 2x layout? but would need to update dims accordingly casted_x_lhs, casted_kernel_rhs, dimension_numbers=dims, diff --git a/transformer_engine/jax/flax/__init__.py b/transformer_engine/jax/flax/__init__.py index 4952357264..3b64e49482 100644 --- a/transformer_engine/jax/flax/__init__.py +++ b/transformer_engine/jax/flax/__init__.py @@ -4,7 +4,12 @@ """Transformer Engine bindings for JAX""" from .module import DenseGeneral, LayerNorm from .module import LayerNormDenseGeneral, LayerNormMLP -from .module import wrap_function_in_te_state_module, make_dot_general_cls, make_einsum_cls, make_ragged_dot_cls +from .module import ( + wrap_function_in_te_state_module, + make_dot_general_cls, + make_einsum_cls, + make_ragged_dot_cls, +) from .transformer import extend_logical_axis_rules from .transformer import DotProductAttention, MultiHeadAttention, RelativePositionBiases from .transformer import TransformerLayer, TransformerLayerType diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 82be17006c..c4f326ea23 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1442,87 +1442,102 @@ def te_dot_general(generate_quantizer_set, x, kernel, dims, **kwargs): return wrap_function_in_te_state_module(te_dot_general, quantization_recipe, "dot_general") + def make_einsum_cls(quantization_recipe): import functools import math import jax + def te_einsum(generate_quantizer_set, s, x, kernel, **kwargs): - # with open("/tmp/te_einsum_log.txt", "a") as f: - # f.write(f"{(s, x.shape, kernel.shape)}\n") - def dot_general(x, kernel, dims, *args, **kwargs): - # print(f"TE dot_general called with dims: {dims}, args: {args}, kwargs: {kwargs}") - contracting_dims, batch_dims = dims - ((x_bdim,), (k_bdim,)) = batch_dims - batch_dims = (x_bdim, k_bdim) - - if x_bdim != 0 or k_bdim != 0: - print(f"{x_bdim=}, {k_bdim=}") - return jax.lax.dot_general(x, kernel, dims, *args, **kwargs) - - target_out_shape = jax.lax.dot_general(x, kernel, dims).shape - - if x.dtype not in [jnp.float16, jnp.bfloat16, jnp.float32, jnp.float64]: - # HACK: because x input is bool for dispatch mask - x = x.astype(kernel.dtype) - - # Adjust for unbatched - contracting_dims = tuple( - tuple(dim - (1 if dim > bdim else 0) for dim in cdims) - for bdim, cdims in zip(batch_dims, contracting_dims)) - - group_sizes = None - print(f'{x.shape=}, {kernel.shape=}, {dims=}') - - def reorder_lhs_for_grouped_gemm(tensor, cdims): - # (B*M, K) - assert len(cdims) == 1, f"Only support single contracting dim for now, got {cdims}" - cdim = cdims[0] + 1 # account for batch dim at front - out = jnp.transpose(tensor, tuple(range(cdim)) + tuple(range(cdim + 1, tensor.ndim)) + (cdim,)) - return out.reshape((-1, out.shape[-1])) - - - def reorder_rhs_for_grouped_gemm(tensor, bdims, cdims): - # (B, K, N) - assert len(bdims) == 1 and len(cdims) == 1, f"Only support single batch and contracting dim for now, got {bdims}, {cdims}" - bdim = bdims[0] - assert bdim == 0, f"Only support batch dim 0 for now, got {bdim}" - cdim = cdims[0] + 1 # account for batch dim at front - out = jnp.transpose(tensor, (bdim, cdim) + tuple(i for i in range(tensor.ndim) if i != bdim and i != cdim)) - return out.reshape((*out.shape[:2], -1)) - - x = reorder_lhs_for_grouped_gemm(x, contracting_dims[0]) - kernel = reorder_rhs_for_grouped_gemm(kernel, (batch_dims[1],), contracting_dims[1]) - - num_groups = kernel.shape[0] - group_size = x.shape[1] - print(f'{num_groups=}, {group_size=}, {x.shape=}, {kernel.shape=}') - - group_sizes = jnp.array([group_size]*num_groups, dtype=jnp.int32) + # with open("/tmp/te_einsum_log.txt", "a") as f: + # f.write(f"{(s, x.shape, kernel.shape)}\n") + def dot_general(x, kernel, dims, *args, **kwargs): + # print(f"TE dot_general called with dims: {dims}, args: {args}, kwargs: {kwargs}") + contracting_dims, batch_dims = dims + ((x_bdim,), (k_bdim,)) = batch_dims + batch_dims = (x_bdim, k_bdim) + + if x_bdim != 0 or k_bdim != 0: + print(f"{x_bdim=}, {k_bdim=}") + return jax.lax.dot_general(x, kernel, dims, *args, **kwargs) + + target_out_shape = jax.lax.dot_general(x, kernel, dims).shape + + if x.dtype not in [jnp.float16, jnp.bfloat16, jnp.float32, jnp.float64]: + # HACK: because x input is bool for dispatch mask + x = x.astype(kernel.dtype) + + # Adjust for unbatched + contracting_dims = tuple( + tuple(dim - (1 if dim > bdim else 0) for dim in cdims) + for bdim, cdims in zip(batch_dims, contracting_dims) + ) - quantizer_set = generate_quantizer_set(n_groups=num_groups) + group_sizes = None + print(f"{x.shape=}, {kernel.shape=}, {dims=}") + + def reorder_lhs_for_grouped_gemm(tensor, cdims): + # (B*M, K) + assert len(cdims) == 1, f"Only support single contracting dim for now, got {cdims}" + cdim = cdims[0] + 1 # account for batch dim at front + out = jnp.transpose( + tensor, tuple(range(cdim)) + tuple(range(cdim + 1, tensor.ndim)) + (cdim,) + ) + return out.reshape((-1, out.shape[-1])) + + def reorder_rhs_for_grouped_gemm(tensor, bdims, cdims): + # (B, K, N) + assert ( + len(bdims) == 1 and len(cdims) == 1 + ), f"Only support single batch and contracting dim for now, got {bdims}, {cdims}" + bdim = bdims[0] + assert bdim == 0, f"Only support batch dim 0 for now, got {bdim}" + cdim = cdims[0] + 1 # account for batch dim at front + out = jnp.transpose( + tensor, + (bdim, cdim) + tuple(i for i in range(tensor.ndim) if i != bdim and i != cdim), + ) + return out.reshape((*out.shape[:2], -1)) - print(f'{group_sizes=}, {contracting_dims=}, {x.shape=}, {kernel.shape=}, {contracting_dims=}') + x = reorder_lhs_for_grouped_gemm(x, contracting_dims[0]) + kernel = reorder_rhs_for_grouped_gemm(kernel, (batch_dims[1],), contracting_dims[1]) + + num_groups = kernel.shape[0] + group_size = x.shape[1] + print(f"{num_groups=}, {group_size=}, {x.shape=}, {kernel.shape=}") + + group_sizes = jnp.array([group_size] * num_groups, dtype=jnp.int32) + + quantizer_set = generate_quantizer_set(n_groups=num_groups) + + print( + f"{group_sizes=}, {contracting_dims=}, {x.shape=}, {kernel.shape=}," + f" {contracting_dims=}" + ) + + contracting_dims = ( + # (B*M, K) + (1,), + # (B, K, N) + (1,), + ) + out = grouped_dense( + x, + kernel, + group_sizes=group_sizes, + contracting_dims=contracting_dims, + quantizer_set=quantizer_set, + ) + return out.reshape(target_out_shape) + + return jnp.einsum(s, x, kernel, _dot_general=dot_general, **kwargs) - contracting_dims = ( - # (B*M, K) - (1,), - # (B, K, N) - (1,), - ) - out = grouped_dense( - x, - kernel, - group_sizes=group_sizes, - contracting_dims=contracting_dims, - quantizer_set=quantizer_set - ) - return out.reshape(target_out_shape) - return jnp.einsum(s, x, kernel, _dot_general=dot_general, **kwargs) - return wrap_function_in_te_state_module(te_einsum, quantization_recipe, "einsum")() + def make_ragged_dot_cls(quantization_recipe): import jax + def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwargs): num_groups = group_sizes.shape[0] quantizer_set = generate_quantizer_set(n_groups=num_groups) @@ -1530,13 +1545,15 @@ def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwa target_out_shape = jax.lax.ragged_dot(x, kernel, group_sizes=group_sizes).shape out = grouped_dense( - x, - kernel, - group_sizes=group_sizes, - contracting_dims=((1,), (1,)), - # quantizer_set=quantizer_set + x, + kernel, + group_sizes=group_sizes, + contracting_dims=((1,), (1,)), + # quantizer_set=quantizer_set ) return out.reshape(target_out_shape) - - return wrap_function_in_te_state_module(te_grouped_dot_general, quantization_recipe, "ragged_dot")() + + return wrap_function_in_te_state_module( + te_grouped_dot_general, quantization_recipe, "ragged_dot" + )() diff --git a/transformer_engine/jax/rtc/__init__.py b/transformer_engine/jax/rtc/__init__.py index 795e5e7efc..62d9367ccc 100644 --- a/transformer_engine/jax/rtc/__init__.py +++ b/transformer_engine/jax/rtc/__init__.py @@ -1 +1 @@ -from .rtc import compile_extension \ No newline at end of file +from .rtc import compile_extension diff --git a/transformer_engine/jax/rtc/rtc.py b/transformer_engine/jax/rtc/rtc.py index a76f8b318c..b1666118e5 100644 --- a/transformer_engine/jax/rtc/rtc.py +++ b/transformer_engine/jax/rtc/rtc.py @@ -112,8 +112,10 @@ def setup_jax_extension( libraries=["nccl"], ) + _compiled = False + def compile_extension(): import os import shutil @@ -148,10 +150,14 @@ def compile_extension(): os.makedirs(cmd.build_lib, exist_ok=True) cmd.run() - subprocess.call([ - "cp", - os.path.join(cmd.build_lib, "transformer_engine_jax" + cmd.get_ext_filename(fullname="")), - base_dir, - ]) + subprocess.call( + [ + "cp", + os.path.join( + cmd.build_lib, "transformer_engine_jax" + cmd.get_ext_filename(fullname="") + ), + base_dir, + ] + ) - _compiled = True \ No newline at end of file + _compiled = True diff --git a/transformer_engine/jax/sharding.py b/transformer_engine/jax/sharding.py index a022e1d7e8..80133cdbc4 100644 --- a/transformer_engine/jax/sharding.py +++ b/transformer_engine/jax/sharding.py @@ -50,9 +50,8 @@ def _get_mesh_info(resource: str, mesh: jax.sharding.Mesh): assert resource in mesh.axis_names, f"{resource} is not in the axis_names of Mesh {mesh}." return mesh.shape[resource], resource - -# TODO(jberchtold): FIXME, this validation fails in FP8CS amax reduction because the GlobalMeshResource is set but there is no active mesh in the context (afaict shard_map does not share it's mesh as a context), so this is triggering a FalsePositive assert. However, I am not sure if we can safely ignore this when the mesh is empty or all axes are manual as some users may use shard_map with some axes manual and some auto. -# def _validate_mesh_resource_configuration(mesh_resource): + # TODO(jberchtold): FIXME, this validation fails in FP8CS amax reduction because the GlobalMeshResource is set but there is no active mesh in the context (afaict shard_map does not share it's mesh as a context), so this is triggering a FalsePositive assert. However, I am not sure if we can safely ignore this when the mesh is empty or all axes are manual as some users may use shard_map with some axes manual and some auto. + # def _validate_mesh_resource_configuration(mesh_resource): """Validate that the mesh resource configuration is consistent and conflict-free.""" is_tp_enabled = ( mesh_resource.tp_resource is not None and get_mesh_axis_size(mesh_resource.tp_resource) > 1 From 2958895077aaa6c659a90c6b8b87fcc6726502e0 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Fri, 6 Feb 2026 10:36:43 -0800 Subject: [PATCH 82/98] bf16 dsv3 ep=2 fsdp=2 trains without nan in loss --- transformer_engine/jax/cpp_extensions/gemm.py | 10 +++++++++- transformer_engine/jax/csrc/extensions/gemm.cpp | 16 ++++++++-------- transformer_engine/jax/dense.py | 4 ++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 83b3e7e700..73b0475bef 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -985,7 +985,15 @@ def _parse_operand_output_specs( # Non-contracting dims of RHS always needs to be gathered along the FSDP axis rhs_non_cspecs = tuple( - None if spec is not None and spec == gsr.fsdp_resource else spec + ( + None + if spec is not None + and ( + spec == gsr.fsdp_resource + or (isinstance(spec, tuple) and gsr.fsdp_resource in spec) + ) + else spec + ) for spec in rhs_non_cspecs ) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 3048409ea4..4ba6fee2d8 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -796,14 +796,14 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); - int currentDevice; - cudaGetDevice(¤tDevice); - printf("[gpu=%d] Group sizes[total_group_size=%zu, m=%zu]: ", currentDevice, - std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL), m); - for (size_t i = 0; i < num_gemms; ++i) { - printf("%d, ", host_group_sizes[i]); - } - printf("\n"); + // int currentDevice; + // cudaGetDevice(¤tDevice); + // printf("[gpu=%d] Group sizes[total_group_size=%zu, m=%zu]: ", currentDevice, + // std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL), m); + // for (size_t i = 0; i < num_gemms; ++i) { + // printf("%d, ", host_group_sizes[i]); + // } + // printf("\n"); nvte_grouped_gemm(lhs_tensor, lhs_is_trans, rhs_tensor, rhs_is_trans, nullptr, out_tensor, alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index b438a435ae..e163f8dcd2 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -667,8 +667,8 @@ def _grouped_dense_bwd_rule( dkernel_amax = None # HACK - dgrad = jnp.zeros_like(dgrad) - wgrad = jnp.zeros_like(wgrad) + # dgrad = jnp.zeros_like(dgrad) + # wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set From 5c623b28d5943a6b4d36486031b8275f76517569 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Fri, 6 Feb 2026 12:27:19 -0800 Subject: [PATCH 83/98] fp8cs no longer produces nans for ep=2 fsdp=2 --- .../jax/csrc/extensions/gemm.cpp | 48 +++++++++---------- transformer_engine/jax/flax/module.py | 2 +- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 4ba6fee2d8..4c888ee0de 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -814,30 +814,30 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL) * n * out_dtype_bytes; cudaMemsetAsync(output->untyped_data() + _offset, 0, output->size_bytes() - _offset, stream); - std::vector<__bf16> debug_output(m * n); - cudaMemcpyAsync(debug_output.data(), output->untyped_data(), m * n * out_dtype_bytes, - cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - size_t totalPrints = 0; - constexpr size_t MAX_PRINTS = 1; - for (size_t i_m = 0; i_m < m; i_m++) { - for (size_t i_n = 0; i_n < n; i_n++) { - size_t index = i_m * n + i_n; - if (isnan(static_cast(debug_output[index])) || - isinf(static_cast(debug_output[index]))) { - printf("[gpu=%d] Output contains NaN or Inf at index [%zu, %zu] (flat index %zu)\n", i_m, - i_n, index); - totalPrints++; - if (totalPrints >= MAX_PRINTS) { - break; - } - } - } - if (totalPrints >= MAX_PRINTS) { - break; - } - } + // std::vector<__bf16> debug_output(m * n); + // cudaMemcpyAsync(debug_output.data(), output->untyped_data(), m * n * out_dtype_bytes, + // cudaMemcpyDeviceToHost, stream); + // cudaStreamSynchronize(stream); + + // size_t totalPrints = 0; + // constexpr size_t MAX_PRINTS = 1; + // for (size_t i_m = 0; i_m < m; i_m++) { + // for (size_t i_n = 0; i_n < n; i_n++) { + // size_t index = i_m * n + i_n; + // if (isnan(static_cast(debug_output[index])) || + // isinf(static_cast(debug_output[index]))) { + // printf("[gpu=%d] Output contains NaN or Inf at index [%zu, %zu] (flat index %zu)\n", i_m, + // i_n, index); + // totalPrints++; + // if (totalPrints >= MAX_PRINTS) { + // break; + // } + // } + // } + // if (totalPrints >= MAX_PRINTS) { + // break; + // } + // } return ffi_with_cuda_error_check(); } diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index c4f326ea23..09e57fe94a 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1549,7 +1549,7 @@ def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwa kernel, group_sizes=group_sizes, contracting_dims=((1,), (1,)), - # quantizer_set=quantizer_set + quantizer_set=quantizer_set ) return out.reshape(target_out_shape) From c3fe902ecf442dc6949cb9a2e2fc78a6660e80e3 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Mon, 9 Feb 2026 12:00:47 -0800 Subject: [PATCH 84/98] refactor Signed-off-by: Jeremy Berchtold --- .../common/gemm/cublaslt_gemm.cu | 8 +- transformer_engine/jax/cpp_extensions/gemm.py | 8 +- .../jax/csrc/extensions/amax.cpp | 45 ----------- .../jax/csrc/extensions/inspect.cpp | 79 +++++++++++++++++++ .../jax/debug/experimental/__init__.py | 14 ++++ .../jax/{ => debug/experimental}/inspect.py | 4 +- 6 files changed, 103 insertions(+), 55 deletions(-) create mode 100644 transformer_engine/jax/csrc/extensions/inspect.cpp create mode 100644 transformer_engine/jax/debug/experimental/__init__.py rename transformer_engine/jax/{ => debug/experimental}/inspect.py (95%) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index c58c3cb47a..11543be501 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -154,8 +154,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla if (is_fp8_dtype(ret.Atype)) { // Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage - NVTE_CHECK(ret.lda % 16 == 0, - "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); + // NVTE_CHECK(ret.lda % 16 == 0, + // "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); } } else if (nvfp4) { // NVFP4 GEMM. Either the pure NVFP4 recipe or the FWD pass of the Hybrid NVFP4/MXFP8 recipe. @@ -245,8 +245,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla if (is_fp8_dtype(ret.Atype)) { // Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage - NVTE_CHECK(ret.ldb % 16 == 0, - "Leading dimension requirement on B for FP8 GEMM. Caller must pad."); + // NVTE_CHECK(ret.ldb % 16 == 0, + // "Leading dimension requirement on B for FP8 GEMM. Caller must pad."); } } else if (nvfp4) { if (is_B_transposed) { diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index d400412386..71f133bfc4 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -373,10 +373,10 @@ def assert_cublas_requirements(scaling_mode, contracting_size, tensor_name): # Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage alignment = 32 if scaling_mode.is_nvfp4_scaling else 16 - # assert contracting_size % alignment == 0, ( - # f"cuBLAS GEMM {tensor_name} tensor's contracting dimension must be a multiple of" - # f" {alignment} when using quantized inputs. Got contracting_size={contracting_size}" - # ) + assert contracting_size % alignment == 0, ( + f"cuBLAS GEMM {tensor_name} tensor's contracting dimension must be a multiple of" + f" {alignment} when using quantized inputs. Got contracting_size={contracting_size}" + ) class GemmPrimitive(BasePrimitive): diff --git a/transformer_engine/jax/csrc/extensions/amax.cpp b/transformer_engine/jax/csrc/extensions/amax.cpp index e18ee99d81..58c89cfd32 100644 --- a/transformer_engine/jax/csrc/extensions/amax.cpp +++ b/transformer_engine/jax/csrc/extensions/amax.cpp @@ -5,9 +5,6 @@ ************************************************************************/ #include -#include -#include - #include "../extensions.h" #include "transformer_engine/cast.h" #include "transformer_engine/hadamard_transform.h" @@ -97,47 +94,5 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL( .Attr("produce_regular_amax") // produce_regular_amax .Attr("flatten_axis")); // flatten_axis -Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type output_buf) { - NVTE_CHECK(input_buf.untyped_data() != nullptr, "Input must be provided for inspect operation"); - NVTE_CHECK(output_buf->untyped_data() != nullptr, - "Output must be provided for inspect operation"); - NVTE_CHECK(input_buf.untyped_data() == output_buf->untyped_data(), - "Input and output must point to the same buffer for inspect operation"); - - std::vector input_data(input_buf.size_bytes()); - cudaMemcpyAsync(input_data.data(), input_buf.untyped_data(), input_buf.size_bytes(), - cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - int device; - cudaGetDevice(&device); - - std::string filename = "my_tensor_gpu" + std::to_string(device) + ".bin"; - std::ofstream file(filename, std::ios::binary); - if (file.is_open()) { - file.write(reinterpret_cast(input_data.data()), input_data.size()); - file.close(); - } - printf("Tensor data written to %s (shape: [", filename.c_str()); - for (size_t i = 0; i < input_buf.dimensions().size(); ++i) { - printf("%ld", static_cast(input_buf.dimensions()[i])); - if (i < input_buf.dimensions().size() - 1) { - printf(", "); - } - } - printf("], dtype: %d)\n", static_cast(input_buf.element_type())); - - // TODO: make a metadata file with tensor shape and dtype? - - return ffi_with_cuda_error_check(); -} - -XLA_FFI_DEFINE_HANDLER_SYMBOL(InspectHandler, InspectFFI, - FFI::Bind() - .Ctx() // stream - .Arg() // input - .Ret() // output -); - } // namespace jax } // namespace transformer_engine diff --git a/transformer_engine/jax/csrc/extensions/inspect.cpp b/transformer_engine/jax/csrc/extensions/inspect.cpp new file mode 100644 index 0000000000..4d4d96dee0 --- /dev/null +++ b/transformer_engine/jax/csrc/extensions/inspect.cpp @@ -0,0 +1,79 @@ +/************************************************************************* + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ +#include + +#include +#include + +#include "../extensions.h" +#include "xla/ffi/api/c_api.h" + +namespace transformer_engine { +namespace jax { + +Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type output_buf) { + NVTE_CHECK(input_buf.untyped_data() != nullptr, "Input must be provided for inspect operation"); + NVTE_CHECK(output_buf->untyped_data() != nullptr, + "Output must be provided for inspect operation"); + NVTE_CHECK(input_buf.untyped_data() == output_buf->untyped_data(), + "Input and output must point to the same buffer for inspect operation"); + + std::vector input_data(input_buf.size_bytes()); + cudaMemcpyAsync(input_data.data(), input_buf.untyped_data(), input_buf.size_bytes(), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + int device; + cudaGetDevice(&device); + + // Write the tensor data to a file as a binary blob + std::string filename = "my_tensor_gpu" + std::to_string(device) + ".bin"; + std::ofstream file(filename, std::ios::binary); + if (file.is_open()) { + file.write(reinterpret_cast(input_data.data()), input_data.size()); + file.close(); + } + + // Write out a metadata file + std::string meta_filename = "my_tensor_gpu" + std::to_string(device) + "_meta.json"; + std::ofstream meta_file(meta_filename); + if (meta_file.is_open()) { + meta_file << "{"; + meta_file << "\"shape\": ["; + for (size_t i = 0; i < input_buf.dimensions().size(); ++i) { + meta_file << input_buf.dimensions()[i]; + if (i < input_buf.dimensions().size() - 1) { + meta_file << ", "; + } + } + meta_file << "], "; + meta_file << "\"dtype\": " << static_cast(input_buf.element_type()); + meta_file << "}"; + meta_file.close(); + } + + // Log the tensor metadata to the console + printf("Tensor data written to %s (shape: [", filename.c_str()); + for (size_t i = 0; i < input_buf.dimensions().size(); ++i) { + printf("%ld", static_cast(input_buf.dimensions()[i])); + if (i < input_buf.dimensions().size() - 1) { + printf(", "); + } + } + printf("], dtype: %d)\n", static_cast(input_buf.element_type())); + + return ffi_with_cuda_error_check(); +} + +XLA_FFI_DEFINE_HANDLER_SYMBOL(InspectHandler, InspectFFI, + FFI::Bind() + .Ctx() // stream + .Arg() // input + .Ret() // output +); + +} // namespace jax +} // namespace transformer_engine diff --git a/transformer_engine/jax/debug/experimental/__init__.py b/transformer_engine/jax/debug/experimental/__init__.py new file mode 100644 index 0000000000..44a4847660 --- /dev/null +++ b/transformer_engine/jax/debug/experimental/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. +"""EXPERIMENTAL debugging utilities for Transformer Engine JAX. + +This API is experimental and may change or be removed without deprecation in future releases. +""" + +from .inspect import inspect_array, load_array_dump + +__all__ = [ + "inspect_array", + "load_array_dump", +] diff --git a/transformer_engine/jax/inspect.py b/transformer_engine/jax/debug/experimental/inspect.py similarity index 95% rename from transformer_engine/jax/inspect.py rename to transformer_engine/jax/debug/experimental/inspect.py index 61bbaf8bb0..ddb1a5b069 100644 --- a/transformer_engine/jax/inspect.py +++ b/transformer_engine/jax/debug/experimental/inspect.py @@ -1,7 +1,7 @@ # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # See LICENSE for license information. -"""JAX array inspection utilities.""" +"""Experimental JAX array inspection utilities.""" from functools import partial @@ -9,7 +9,7 @@ import jax.numpy as jnp from jax import ffi -from .cpp_extensions.base import BasePrimitive, register_primitive +from transformer_engine.jax.cpp_extensions.base import BasePrimitive, register_primitive __all__ = ["inspect_array", "load_array_dump"] From cf7be54fec5ddae347bc81b660b58d85744aab9a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Feb 2026 20:01:52 +0000 Subject: [PATCH 85/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/gemm/cublaslt_gemm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 11543be501..241e30764a 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -155,7 +155,7 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla if (is_fp8_dtype(ret.Atype)) { // Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage // NVTE_CHECK(ret.lda % 16 == 0, - // "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); + // "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); } } else if (nvfp4) { // NVFP4 GEMM. Either the pure NVFP4 recipe or the FWD pass of the Hybrid NVFP4/MXFP8 recipe. From cdf53f516edc1e122c100f292784f49a52dcf7fe Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Mon, 9 Feb 2026 13:48:13 -0800 Subject: [PATCH 86/98] Add tensor statistics Signed-off-by: Jeremy Berchtold --- .../common/gemm/cublaslt_gemm.cu | 8 ++--- .../jax/csrc/extensions/inspect.cpp | 26 ++++++++++++-- .../jax/debug/experimental/inspect.py | 34 ++++++++++++++++++- 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 241e30764a..c58c3cb47a 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -154,8 +154,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla if (is_fp8_dtype(ret.Atype)) { // Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage - // NVTE_CHECK(ret.lda % 16 == 0, - // "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); + NVTE_CHECK(ret.lda % 16 == 0, + "Leading dimension requirement on A for FP8 GEMM. Caller must pad."); } } else if (nvfp4) { // NVFP4 GEMM. Either the pure NVFP4 recipe or the FWD pass of the Hybrid NVFP4/MXFP8 recipe. @@ -245,8 +245,8 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla if (is_fp8_dtype(ret.Atype)) { // Requirements from https://docs.nvidia.com/cuda/cublas/#tensor-core-usage - // NVTE_CHECK(ret.ldb % 16 == 0, - // "Leading dimension requirement on B for FP8 GEMM. Caller must pad."); + NVTE_CHECK(ret.ldb % 16 == 0, + "Leading dimension requirement on B for FP8 GEMM. Caller must pad."); } } else if (nvfp4) { if (is_B_transposed) { diff --git a/transformer_engine/jax/csrc/extensions/inspect.cpp b/transformer_engine/jax/csrc/extensions/inspect.cpp index 4d4d96dee0..6a03407bca 100644 --- a/transformer_engine/jax/csrc/extensions/inspect.cpp +++ b/transformer_engine/jax/csrc/extensions/inspect.cpp @@ -14,7 +14,13 @@ namespace transformer_engine { namespace jax { -Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type output_buf) { +Error_Type InspectFFI(cudaStream_t stream, + Buffer_Type input_buf, + Buffer_Type min_buf, + Buffer_Type max_buf, + Buffer_Type mean_buf, + Buffer_Type std_buf, + Result_Type output_buf) { NVTE_CHECK(input_buf.untyped_data() != nullptr, "Input must be provided for inspect operation"); NVTE_CHECK(output_buf->untyped_data() != nullptr, "Output must be provided for inspect operation"); @@ -24,6 +30,13 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type ou std::vector input_data(input_buf.size_bytes()); cudaMemcpyAsync(input_data.data(), input_buf.untyped_data(), input_buf.size_bytes(), cudaMemcpyDeviceToHost, stream); + + float min_val{}, max_val{}, mean_val{}, std_val{}; + cudaMemcpyAsync(&min_val, min_buf.untyped_data(), sizeof(float), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&max_val, max_buf.untyped_data(), sizeof(float), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&mean_val, mean_buf.untyped_data(), sizeof(float), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&std_val, std_buf.untyped_data(), sizeof(float), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); int device; @@ -51,6 +64,10 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type ou } meta_file << "], "; meta_file << "\"dtype\": " << static_cast(input_buf.element_type()); + meta_file << ", \"min\": " << min_val; + meta_file << ", \"max\": " << max_val; + meta_file << ", \"mean\": " << mean_val; + meta_file << ", \"std\": " << std_val; meta_file << "}"; meta_file.close(); } @@ -63,7 +80,8 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Result_Type ou printf(", "); } } - printf("], dtype: %d)\n", static_cast(input_buf.element_type())); + printf("], dtype: %d", static_cast(input_buf.element_type())); + printf(", min: %f, max: %f, mean: %f, std: %f)\n", min_val, max_val, mean_val, std_val); return ffi_with_cuda_error_check(); } @@ -72,6 +90,10 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(InspectHandler, InspectFFI, FFI::Bind() .Ctx() // stream .Arg() // input + .Arg() // min + .Arg() // max + .Arg() // mean + .Arg() // std .Ret() // output ); diff --git a/transformer_engine/jax/debug/experimental/inspect.py b/transformer_engine/jax/debug/experimental/inspect.py index ddb1a5b069..c87d34285b 100644 --- a/transformer_engine/jax/debug/experimental/inspect.py +++ b/transformer_engine/jax/debug/experimental/inspect.py @@ -28,16 +28,28 @@ class InspectPrimitive(BasePrimitive): @staticmethod def abstract( x_aval, + x_min_aval, + x_max_aval, + x_mean_aval, + x_std_aval, ): """ inspect abstract """ + assert x_min_aval.shape == () and x_min_aval.dtype == jnp.float32, "x_min must be a scalar with dtype float32" + assert x_max_aval.shape == () and x_max_aval.dtype == jnp.float32, "x_max must be a scalar with dtype float32" + assert x_mean_aval.shape == () and x_mean_aval.dtype == jnp.float32, "x_mean must be a scalar with dtype float32" + assert x_std_aval.shape == () and x_std_aval.dtype == jnp.float32, "x_std must be a scalar with dtype float32" return x_aval @staticmethod def lowering( ctx, x, + x_min, + x_max, + x_mean, + x_std, ): """ inspect lowering rules @@ -49,11 +61,19 @@ def lowering( )( ctx, x, + x_min, + x_max, + x_mean, + x_std, ) @staticmethod def impl( x, + x_min, + x_max, + x_mean, + x_std, ): """ inspect implementation @@ -61,12 +81,24 @@ def impl( assert InspectPrimitive.inner_primitive is not None (x) = InspectPrimitive.inner_primitive.bind( x, + x_min, + x_max, + x_mean, + x_std, ) return x register_primitive(InspectPrimitive) +def _inspect_array_inner(x: jnp.ndarray) -> jnp.ndarray: + return InspectPrimitive.outer_primitive.bind( + x, + jnp.min(x).astype(jnp.float32), + jnp.max(x).astype(jnp.float32), + jnp.mean(x.astype(jnp.float32)), + jnp.std(x.astype(jnp.float32)), + ) @partial(jax.custom_vjp, nondiff_argnums=()) def _inspect( @@ -84,7 +116,7 @@ def _inspect_fwd_rule( ): """""" ctx = () - x = InspectPrimitive.outer_primitive.bind(x) + x = _inspect_array_inner(x) return x, ctx From 39a219498dfa3b655df5e289c91f1ed50f92ce15 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Feb 2026 21:49:13 +0000 Subject: [PATCH 87/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../jax/csrc/extensions/inspect.cpp | 13 +++++-------- .../jax/debug/experimental/inspect.py | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/inspect.cpp b/transformer_engine/jax/csrc/extensions/inspect.cpp index 6a03407bca..56ac585126 100644 --- a/transformer_engine/jax/csrc/extensions/inspect.cpp +++ b/transformer_engine/jax/csrc/extensions/inspect.cpp @@ -14,13 +14,9 @@ namespace transformer_engine { namespace jax { -Error_Type InspectFFI(cudaStream_t stream, - Buffer_Type input_buf, - Buffer_Type min_buf, - Buffer_Type max_buf, - Buffer_Type mean_buf, - Buffer_Type std_buf, - Result_Type output_buf) { +Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type min_buf, + Buffer_Type max_buf, Buffer_Type mean_buf, Buffer_Type std_buf, + Result_Type output_buf) { NVTE_CHECK(input_buf.untyped_data() != nullptr, "Input must be provided for inspect operation"); NVTE_CHECK(output_buf->untyped_data() != nullptr, "Output must be provided for inspect operation"); @@ -34,7 +30,8 @@ Error_Type InspectFFI(cudaStream_t stream, float min_val{}, max_val{}, mean_val{}, std_val{}; cudaMemcpyAsync(&min_val, min_buf.untyped_data(), sizeof(float), cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(&max_val, max_buf.untyped_data(), sizeof(float), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(&mean_val, mean_buf.untyped_data(), sizeof(float), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(&mean_val, mean_buf.untyped_data(), sizeof(float), cudaMemcpyDeviceToHost, + stream); cudaMemcpyAsync(&std_val, std_buf.untyped_data(), sizeof(float), cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); diff --git a/transformer_engine/jax/debug/experimental/inspect.py b/transformer_engine/jax/debug/experimental/inspect.py index c87d34285b..59ec98fd8c 100644 --- a/transformer_engine/jax/debug/experimental/inspect.py +++ b/transformer_engine/jax/debug/experimental/inspect.py @@ -36,10 +36,18 @@ def abstract( """ inspect abstract """ - assert x_min_aval.shape == () and x_min_aval.dtype == jnp.float32, "x_min must be a scalar with dtype float32" - assert x_max_aval.shape == () and x_max_aval.dtype == jnp.float32, "x_max must be a scalar with dtype float32" - assert x_mean_aval.shape == () and x_mean_aval.dtype == jnp.float32, "x_mean must be a scalar with dtype float32" - assert x_std_aval.shape == () and x_std_aval.dtype == jnp.float32, "x_std must be a scalar with dtype float32" + assert ( + x_min_aval.shape == () and x_min_aval.dtype == jnp.float32 + ), "x_min must be a scalar with dtype float32" + assert ( + x_max_aval.shape == () and x_max_aval.dtype == jnp.float32 + ), "x_max must be a scalar with dtype float32" + assert ( + x_mean_aval.shape == () and x_mean_aval.dtype == jnp.float32 + ), "x_mean must be a scalar with dtype float32" + assert ( + x_std_aval.shape == () and x_std_aval.dtype == jnp.float32 + ), "x_std must be a scalar with dtype float32" return x_aval @staticmethod @@ -91,6 +99,7 @@ def impl( register_primitive(InspectPrimitive) + def _inspect_array_inner(x: jnp.ndarray) -> jnp.ndarray: return InspectPrimitive.outer_primitive.bind( x, @@ -100,6 +109,7 @@ def _inspect_array_inner(x: jnp.ndarray) -> jnp.ndarray: jnp.std(x.astype(jnp.float32)), ) + @partial(jax.custom_vjp, nondiff_argnums=()) def _inspect( x, From 378b4ecc1bea893797403f9f9a4307b67f9dfea7 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Mon, 9 Feb 2026 13:56:13 -0800 Subject: [PATCH 88/98] lint Signed-off-by: Jeremy Berchtold --- transformer_engine/jax/csrc/extensions/inspect.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/jax/csrc/extensions/inspect.cpp b/transformer_engine/jax/csrc/extensions/inspect.cpp index 56ac585126..a7110367b3 100644 --- a/transformer_engine/jax/csrc/extensions/inspect.cpp +++ b/transformer_engine/jax/csrc/extensions/inspect.cpp @@ -72,7 +72,7 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type mi // Log the tensor metadata to the console printf("Tensor data written to %s (shape: [", filename.c_str()); for (size_t i = 0; i < input_buf.dimensions().size(); ++i) { - printf("%ld", static_cast(input_buf.dimensions()[i])); + printf("%zu", static_cast(input_buf.dimensions()[i])); if (i < input_buf.dimensions().size() - 1) { printf(", "); } From 49c3efe227c346290b94c3bb6f4f3f272ebbb458 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 10 Feb 2026 10:00:16 -0800 Subject: [PATCH 89/98] backup Ep=2 fsdp=2 works to train without nan with bf16 gmm with ring_of_experts=false --- tests/jax/test_custom_call_compute.py | 59 +++++++-------- .../common/gemm/cublaslt_grouped_gemm.cu | 5 +- .../jax/csrc/extensions/gemm.cpp | 72 +++++++++---------- transformer_engine/jax/dense.py | 4 +- transformer_engine/jax/flax/module.py | 2 +- 5 files changed, 67 insertions(+), 75 deletions(-) diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py index cc8896da48..813560a191 100644 --- a/tests/jax/test_custom_call_compute.py +++ b/tests/jax/test_custom_call_compute.py @@ -1764,7 +1764,8 @@ def ref_func(x, gamma, kernel_1, kernel_2, bias_1, bias_2): # (5, 32, 128, 64), # Test the case where n_groups is not a multiple of 4 # (4, 16, 4, 4), # (3, 192, 64, 96), - (8, 16384, 14336, 4096), + # (8, 16384, 14336, 4096), + (8, 32768, 14336, 4096), # (8, 16384, 16384, 4096), # (8, 64, 32, 128), # (8, 64, 128, 256), @@ -1806,45 +1807,38 @@ def _ref_grouped_dense(self, lhs, rhs, bias, group_sizes, contracting_dims): return ref_out def _generate_grouped_dense_input(self, dtype, input_shape, data_layout="NN", with_bias=False): - # key = jax.random.PRNGKey(0) - # subkeys = jax.random.split(key, 4) - # n_groups, m, n, k = input_shape + key = jax.random.PRNGKey(0) + subkeys = jax.random.split(key, 4) + n_groups, m, n, k = input_shape + + GROUP_SIZE_USAGE_RATIO = 0.33 # m //= 32 - # group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) - # group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) - # group_sizes = jnp.diff(group_sizes) + group_sizes = jnp.sort(jax.random.randint(subkeys[0], (n_groups - 1,), 0, m)) + group_sizes = jnp.concatenate([jnp.array([0]), group_sizes, jnp.array([m])]) + group_sizes = jnp.diff(group_sizes) + + group_sizes = (group_sizes * GROUP_SIZE_USAGE_RATIO).astype(jnp.int32) - # # Make one empty input lhs to test empty GEMM handling - # group_sizes = group_sizes.at[0].set(group_sizes[0] + group_sizes[1]) - # group_sizes = group_sizes.at[1].set(0) + # Make one empty input lhs to test empty GEMM handling + group_sizes = group_sizes.at[0].set(group_sizes[0] + group_sizes[1]) + group_sizes = group_sizes.at[1].set(0) - # # *32 to make sure that input shape works for MXFP8 + # *32 to make sure that input shape works for MXFP8 # group_sizes = group_sizes * 32 # m = m * 32 - # # group_sizes = jnp.full((n_groups,), m // n_groups) + # group_sizes = jnp.full((n_groups,), m // n_groups) # assert group_sizes.sum() == m - # lhs_shape = (m if data_layout[0] == "N" else k, k if data_layout[0] == "N" else m) - # rhs_shape = (n_groups, k if data_layout[1] == "N" else n, n if data_layout[1] == "N" else k) - # bias_shape = (n_groups, n) + lhs_shape = (m if data_layout[0] == "N" else k, k if data_layout[0] == "N" else m) + rhs_shape = (n_groups, k if data_layout[1] == "N" else n, n if data_layout[1] == "N" else k) + bias_shape = (n_groups, n) - # lhs = jax.random.uniform(subkeys[1], lhs_shape, dtype=dtype) / jnp.sqrt(k) - # rhs = jax.random.uniform(subkeys[2], rhs_shape, dtype=dtype) / jnp.sqrt(k) - # # rhs = jnp.concatenate([i/n_groups*jnp.identity(k, dtype=dtype).reshape(1, k, k) for i in range(n_groups)], axis=0) - # bias = jax.random.uniform(subkeys[3], bias_shape, dtype=dtype) if with_bias else None - - def load_tensor(name: str): - import numpy as np - - tensor = np.load(f"/mnt/jberchtold/polyphe-lustre-home/maxtext/gemm_{name}.npy") - return jnp.array(tensor) - - lhs = load_tensor("lhs").astype(dtype) - rhs = load_tensor("rhs").astype(dtype) - bias = None - group_sizes = load_tensor("group_sizes").astype(jnp.int32) + lhs = jax.random.uniform(subkeys[1], lhs_shape, dtype=dtype) / jnp.sqrt(k) + rhs = jax.random.uniform(subkeys[2], rhs_shape, dtype=dtype) / jnp.sqrt(k) + # rhs = jnp.concatenate([i/n_groups*jnp.identity(k, dtype=dtype).reshape(1, k, k) for i in range(n_groups)], axis=0) + bias = jax.random.uniform(subkeys[3], bias_shape, dtype=dtype) if with_bias else None lhs_contracting_dim = (1,) if data_layout[0] == "N" else (0,) rhs_contracting_dim = (1,) if data_layout[1] == "N" else (2,) @@ -1898,7 +1892,8 @@ def _assert_grouped_gemm_output(self, out, group_sizes, ref, dtype): value_range=(jnp.min(ref), jnp.max(ref)), # value_range=(0, 0.5) ).save("output_diff.png") - assert_allclose(out, ref, dtype=jnp.float32) + assert_allclose(out, ref, dtype=dtype) + assert False # ref_list = jnp.split(ref_list, jnp.cumulative_sum(group_sizes)[:-1], axis=0) # out_list = jnp.split(out, jnp.cumulative_sum(group_sizes)[:-1], axis=0) # print([o.shape for o in out_list]) @@ -1912,7 +1907,7 @@ def _assert_grouped_gemm_output(self, out, group_sizes, ref, dtype): # ) @pytest_parametrize_wrapper("dtype", [jnp.bfloat16, jnp.float16]) - @pytest_parametrize_wrapper("layout", ["NT"]) + @pytest_parametrize_wrapper("layout", ["NN"]) def test_grouped_gemm_fp16(self, dtype, input_shape, layout): lhs, rhs, group_sizes, contracting_dims, _ = self._generate_grouped_dense_input( dtype, input_shape, layout diff --git a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu index b3e216dc4f..a2434419dc 100644 --- a/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_grouped_gemm.cu @@ -487,9 +487,8 @@ __global__ void setup_grouped_gemm_kernel( a_cols[idx] = static_cast(a_first); b_rows[idx] = static_cast(b_last); b_cols[idx] = static_cast(b_first); - // For OUTPUTS (D, C): cuBLAS writes in column-major, so rows=first (M), cols=last (N). - d_rows[idx] = static_cast(d_first); - d_cols[idx] = static_cast(d_last); + d_rows[idx] = static_cast(d_last); + d_cols[idx] = static_cast(d_first); // Fill alpha/beta pointers (per-matrix) alpha_ptrs[idx] = alpha_ptr + idx; diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 4c888ee0de..c6e5b111ad 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -703,6 +703,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // } // printf("]\n"); + NVTE_CHECK(false, "Grouped dense wgrad is not supported in TE/JAX currently."); NVTE_CHECK(lhs_is_trans && !rhs_is_trans, "For grouped dense wgrad, only TN GEMM is supported in TE/JAX currently."); @@ -749,31 +750,23 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type //// RHS NVTEShape rhsShape{.data = {num_gemms * k, n}, .ndim = 2}; - // rhs_is_trans = true; - if (rhs_is_trans) { rhsShape.data[0] = num_gemms * n; rhsShape.data[1] = k; - // std::swap(rhsShape.data[0], rhsShape.data[1]); } - // NVTE_CHECK(!rhs_is_trans, "GroupedGemmFFI currently only supports rhs_is_trans=false"); auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); + NVTE_CHECK(!rhs_is_trans, "Transposed RHS is not supported."); + //// LHS NVTEShape lhsShape{.data = {m, k}, .ndim = 2}; - // NVTE_CHECK(lhs_is_trans, "GroupedGemmFFI currently only supports lhs_is_trans=true"); - // lhs_is_trans = true; - if (!lhs_is_trans) { + if (lhs_is_trans) { std::swap(lhsShape.data[0], lhsShape.data[1]); } - // if (!lhs_is_trans) { - // printf("GroupedGemmFFI: lhs_is_trans=false, m=%zu, k=%zu, n=%zu\n", m, k, n); - // cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); - // return ffi_with_cuda_error_check(); - // } + NVTE_CHECK(!lhs_is_trans, "Transposed LHS is not supported."); auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); lhs_tensor.set_group_info(group_sizes, group_offset_lhs, - lhs_is_trans ? kNVTEGroupedFirstDims : kNVTEGroupedLastDims); + lhs_is_trans ? kNVTEGroupedLastDims : kNVTEGroupedFirstDims); //// OUTPUT NVTEShape outShape{.data = {m, n}, .ndim = 2}; @@ -781,38 +774,41 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type num_gemms, outShape); out_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); - // printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", - // rhsShape.data[0], rhsShape.data[1], - // lhsShape.data[0], lhsShape.data[1], - // outShape.data[0], outShape.data[1]); - - // printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); - // This memset is required because the group sizes may not fill the full buffer since we overallocate for the worst case. However, in theory unused space on the grouped axis should not be utilizied downstream, but it seems like somehow it is utilized. cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); + // std::vector debug_output(m * n * out_dtype_bytes, 0xFF); + // cudaMemcpyAsync(output->untyped_data(), debug_output.data(), m * n * out_dtype_bytes, + // cudaMemcpyHostToDevice, stream); std::vector host_group_sizes(num_gemms); cudaMemcpyAsync(host_group_sizes.data(), group_sizes.untyped_data(), num_gemms * sizeof(int32_t), cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); - // int currentDevice; - // cudaGetDevice(¤tDevice); - // printf("[gpu=%d] Group sizes[total_group_size=%zu, m=%zu]: ", currentDevice, - // std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL), m); - // for (size_t i = 0; i < num_gemms; ++i) { - // printf("%d, ", host_group_sizes[i]); - // } - // printf("\n"); + int currentDevice; + cudaGetDevice(¤tDevice); + printf("[gpu=%d] Group sizes[total_group_size=%zu, m=%zu]: ", currentDevice, + std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL), m); + for (size_t i = 0; i < num_gemms; ++i) { + printf("%d, ", host_group_sizes[i]); + } + printf("\n"); + + nvte_grouped_gemm( + rhs_tensor, rhs_is_trans, + lhs_tensor, lhs_is_trans, + nullptr, out_tensor, + alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), + workspace_cublas.data(), + nullptr, // config (use defaults) + stream); + + // size_t _offset = + // std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL) * n * out_dtype_bytes; + // _offset = 0; + // cudaMemsetAsync(output->untyped_data() + _offset, 0, output->size_bytes() - _offset, stream); + // Why does zeroing the whole buffer here still produce NaNs? Is m, k, n not correctly mapping the shape of the tensor or does the grouped GEMM still overwrite beyond these buffers? - nvte_grouped_gemm(lhs_tensor, lhs_is_trans, rhs_tensor, rhs_is_trans, nullptr, out_tensor, - alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), - workspace_cublas.data(), - nullptr, // config (use defaults) - stream); - size_t _offset = - std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL) * n * out_dtype_bytes; - cudaMemsetAsync(output->untyped_data() + _offset, 0, output->size_bytes() - _offset, stream); // std::vector<__bf16> debug_output(m * n); // cudaMemcpyAsync(debug_output.data(), output->untyped_data(), m * n * out_dtype_bytes, @@ -826,7 +822,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // size_t index = i_m * n + i_n; // if (isnan(static_cast(debug_output[index])) || // isinf(static_cast(debug_output[index]))) { - // printf("[gpu=%d] Output contains NaN or Inf at index [%zu, %zu] (flat index %zu)\n", i_m, + // printf("[gpu=%d] Output contains NaN or Inf at index [%zu, %zu] (flat index %zu)\n", currentDevice, i_m, // i_n, index); // totalPrints++; // if (totalPrints >= MAX_PRINTS) { @@ -839,6 +835,8 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // } // } + cudaStreamSynchronize(stream); + return ffi_with_cuda_error_check(); } diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index e163f8dcd2..b438a435ae 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -667,8 +667,8 @@ def _grouped_dense_bwd_rule( dkernel_amax = None # HACK - # dgrad = jnp.zeros_like(dgrad) - # wgrad = jnp.zeros_like(wgrad) + dgrad = jnp.zeros_like(dgrad) + wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 09e57fe94a..7941c509c5 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1549,7 +1549,7 @@ def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwa kernel, group_sizes=group_sizes, contracting_dims=((1,), (1,)), - quantizer_set=quantizer_set + # quantizer_set=quantizer_set ) return out.reshape(target_out_shape) From 7747a178320752132c637227ee274b0b9babd165 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 10 Feb 2026 10:18:36 -0800 Subject: [PATCH 90/98] dgrad working with ring of experts = false --- transformer_engine/jax/csrc/extensions/gemm.cpp | 3 --- transformer_engine/jax/dense.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index c6e5b111ad..2dd0dca470 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -756,14 +756,11 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type } auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); - NVTE_CHECK(!rhs_is_trans, "Transposed RHS is not supported."); - //// LHS NVTEShape lhsShape{.data = {m, k}, .ndim = 2}; if (lhs_is_trans) { std::swap(lhsShape.data[0], lhsShape.data[1]); } - NVTE_CHECK(!lhs_is_trans, "Transposed LHS is not supported."); auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); lhs_tensor.set_group_info(group_sizes, group_offset_lhs, lhs_is_trans ? kNVTEGroupedLastDims : kNVTEGroupedFirstDims); diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index b438a435ae..27a985e329 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -667,7 +667,7 @@ def _grouped_dense_bwd_rule( dkernel_amax = None # HACK - dgrad = jnp.zeros_like(dgrad) + # dgrad = jnp.zeros_like(dgrad) wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set From df44dc75ba5cf819d5d63c5b5ff1f049ec08bcca Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 10 Feb 2026 10:24:55 -0800 Subject: [PATCH 91/98] wgrad also works with ep=2 fsdp=2 ring_of_experts=false and trains without producing nan --- transformer_engine/jax/csrc/extensions/gemm.cpp | 15 ++++++++------- transformer_engine/jax/dense.py | 2 -- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 2dd0dca470..f01e05630c 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -703,13 +703,11 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // } // printf("]\n"); - NVTE_CHECK(false, "Grouped dense wgrad is not supported in TE/JAX currently."); NVTE_CHECK(lhs_is_trans && !rhs_is_trans, "For grouped dense wgrad, only TN GEMM is supported in TE/JAX currently."); //// RHS NVTEShape rhsShape{.data = {k, n}, .ndim = 2}; - // rhs_is_trans = true; auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape); rhs_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); @@ -735,11 +733,14 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // TODO(jberchtold): make this memset smaller by only zeroing the expert weights that correspond to groups with size zero. cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); - nvte_grouped_gemm(lhs_tensor, lhs_is_trans, rhs_tensor, rhs_is_trans, nullptr, out_tensor, - alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), - workspace_cublas.data(), - nullptr, // config (use defaults) - stream); + nvte_grouped_gemm( + rhs_tensor, rhs_is_trans, + lhs_tensor, lhs_is_trans, + nullptr, out_tensor, + alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), + workspace_cublas.data(), + nullptr, // config (use defaults) + stream); cudaStreamSynchronize(stream); diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py index 27a985e329..c1d1fb0fb9 100644 --- a/transformer_engine/jax/dense.py +++ b/transformer_engine/jax/dense.py @@ -667,8 +667,6 @@ def _grouped_dense_bwd_rule( dkernel_amax = None # HACK - # dgrad = jnp.zeros_like(dgrad) - wgrad = jnp.zeros_like(wgrad) return dgrad, wgrad, group_sizes_grad, dbias, dkernel_amax, quantizer_set From 55ffc615920b41a894f566dbc37c975783013caf Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 10 Feb 2026 15:07:53 -0800 Subject: [PATCH 92/98] remove debugging code --- .../jax/csrc/extensions/gemm.cpp | 77 +------------------ 1 file changed, 4 insertions(+), 73 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index f01e05630c..226d6bfeeb 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -686,23 +686,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type std::vector{num_gemms}, convert_ffi_datatype_to_te_dtype(beta.element_type())); - // printf("Num gemms: %zu, M: %zu, N: %zu, K: %zu, group_sizes: %zu, lhs_is_trans: %d, rhs_is_trans: %d, is_grouped_dense_wgrad: %d\n", num_gemms, m, n, k, group_sizes.dimensions()[0], lhs_is_trans, rhs_is_trans, is_grouped_dense_wgrad); - if (is_grouped_dense_wgrad) { - // printf("GroupedGemmFFI: (lhs_is_trans=%d, rhs_is_trans=%d) m=%zu, k=%zu, n=%zu, rhs_shape=[", lhs_is_trans, rhs_is_trans, m, k, n); - // for (auto dim : rhs_data.dimensions()) { - // printf("%zu, ", dim); - // } - // printf("], lhs_shape=["); - // for (auto dim : lhs_data.dimensions()) { - // printf("%zu, ", dim); - // } - // printf("], out_shape=["); - // for (auto dim : output->dimensions()) { - // printf("%zu, ", dim); - // } - // printf("]\n"); - NVTE_CHECK(lhs_is_trans && !rhs_is_trans, "For grouped dense wgrad, only TN GEMM is supported in TE/JAX currently."); @@ -722,13 +706,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING, num_gemms, outShape); - // printf("rhs_shape: [%zu, %zu], lhs_shape: [%zu, %zu], out_shape: [%zu, %zu]\n", - // rhsShape.data[0], rhsShape.data[1], - // lhsShape.data[0], lhsShape.data[1], - // outShape.data[0], outShape.data[1]); - - // printf("rhs_is_trans: %d, lhs_is_trans: %d\n", rhs_is_trans, lhs_is_trans); - // Output needs to be zeroed in case any group sizes have size zero, meaning the expert weight isn't used in the fwd, meaning the corresponding output gradient should be zero. But using the grouped GEMM, the output buffer contains uninitialized data. // TODO(jberchtold): make this memset smaller by only zeroing the expert weights that correspond to groups with size zero. cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); @@ -742,8 +719,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type nullptr, // config (use defaults) stream); - cudaStreamSynchronize(stream); - return ffi_with_cuda_error_check(); } @@ -774,23 +749,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // This memset is required because the group sizes may not fill the full buffer since we overallocate for the worst case. However, in theory unused space on the grouped axis should not be utilizied downstream, but it seems like somehow it is utilized. cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); - // std::vector debug_output(m * n * out_dtype_bytes, 0xFF); - // cudaMemcpyAsync(output->untyped_data(), debug_output.data(), m * n * out_dtype_bytes, - // cudaMemcpyHostToDevice, stream); - - std::vector host_group_sizes(num_gemms); - cudaMemcpyAsync(host_group_sizes.data(), group_sizes.untyped_data(), num_gemms * sizeof(int32_t), - cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - int currentDevice; - cudaGetDevice(¤tDevice); - printf("[gpu=%d] Group sizes[total_group_size=%zu, m=%zu]: ", currentDevice, - std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL), m); - for (size_t i = 0; i < num_gemms; ++i) { - printf("%d, ", host_group_sizes[i]); - } - printf("\n"); nvte_grouped_gemm( rhs_tensor, rhs_is_trans, @@ -801,39 +759,12 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type nullptr, // config (use defaults) stream); - // size_t _offset = - // std::accumulate(host_group_sizes.begin(), host_group_sizes.end(), 0ULL) * n * out_dtype_bytes; - // _offset = 0; - // cudaMemsetAsync(output->untyped_data() + _offset, 0, output->size_bytes() - _offset, stream); - // Why does zeroing the whole buffer here still produce NaNs? Is m, k, n not correctly mapping the shape of the tensor or does the grouped GEMM still overwrite beyond these buffers? - - - // std::vector<__bf16> debug_output(m * n); - // cudaMemcpyAsync(debug_output.data(), output->untyped_data(), m * n * out_dtype_bytes, - // cudaMemcpyDeviceToHost, stream); + // std::vector host_group_sizes(num_gemms); + // cudaMemcpyAsync(host_group_sizes.data(), group_sizes.untyped_data(), num_gemms * sizeof(int32_t), + // cudaMemcpyDeviceToHost, stream); // cudaStreamSynchronize(stream); - // size_t totalPrints = 0; - // constexpr size_t MAX_PRINTS = 1; - // for (size_t i_m = 0; i_m < m; i_m++) { - // for (size_t i_n = 0; i_n < n; i_n++) { - // size_t index = i_m * n + i_n; - // if (isnan(static_cast(debug_output[index])) || - // isinf(static_cast(debug_output[index]))) { - // printf("[gpu=%d] Output contains NaN or Inf at index [%zu, %zu] (flat index %zu)\n", currentDevice, i_m, - // i_n, index); - // totalPrints++; - // if (totalPrints >= MAX_PRINTS) { - // break; - // } - // } - // } - // if (totalPrints >= MAX_PRINTS) { - // break; - // } - // } - - cudaStreamSynchronize(stream); + // cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); return ffi_with_cuda_error_check(); } From 0774f33651a1477bd4b403f587918a22e9b13ee3 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Tue, 10 Feb 2026 16:14:35 -0800 Subject: [PATCH 93/98] debug changes --- .../jax/csrc/extensions/inspect.cpp | 10 +++++----- .../jax/debug/experimental/__init__.py | 3 ++- .../jax/debug/experimental/inspect.py | 20 +++++++++++++++++++ transformer_engine/jax/flax/module.py | 7 +------ 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/inspect.cpp b/transformer_engine/jax/csrc/extensions/inspect.cpp index a7110367b3..af22d4d17b 100644 --- a/transformer_engine/jax/csrc/extensions/inspect.cpp +++ b/transformer_engine/jax/csrc/extensions/inspect.cpp @@ -41,11 +41,11 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type mi // Write the tensor data to a file as a binary blob std::string filename = "my_tensor_gpu" + std::to_string(device) + ".bin"; - std::ofstream file(filename, std::ios::binary); - if (file.is_open()) { - file.write(reinterpret_cast(input_data.data()), input_data.size()); - file.close(); - } + // std::ofstream file(filename, std::ios::binary); + // if (file.is_open()) { + // file.write(reinterpret_cast(input_data.data()), input_data.size()); + // file.close(); + // } // Write out a metadata file std::string meta_filename = "my_tensor_gpu" + std::to_string(device) + "_meta.json"; diff --git a/transformer_engine/jax/debug/experimental/__init__.py b/transformer_engine/jax/debug/experimental/__init__.py index 44a4847660..551fa95f6e 100644 --- a/transformer_engine/jax/debug/experimental/__init__.py +++ b/transformer_engine/jax/debug/experimental/__init__.py @@ -6,9 +6,10 @@ This API is experimental and may change or be removed without deprecation in future releases. """ -from .inspect import inspect_array, load_array_dump +from .inspect import compare, inspect_array, load_array_dump __all__ = [ + "compare", "inspect_array", "load_array_dump", ] diff --git a/transformer_engine/jax/debug/experimental/inspect.py b/transformer_engine/jax/debug/experimental/inspect.py index 59ec98fd8c..fc883f742a 100644 --- a/transformer_engine/jax/debug/experimental/inspect.py +++ b/transformer_engine/jax/debug/experimental/inspect.py @@ -152,6 +152,26 @@ def inspect_array(x: jnp.ndarray, name: str) -> jnp.ndarray: # TODO: Handle the name of the tensor in the primitive and output files return _inspect(x) +def compare(a: jnp.ndarray, b: jnp.ndarray, name: str) -> jnp.ndarray: + """Utility function to compare two JAX arrays and print their differences. + + Args: + a (jnp.ndarray): The first JAX array to compare. + b (jnp.ndarray): The second JAX array to compare. + name (str): The name of the comparison for identification in the output. + + Returns: + jnp.ndarray: The first input array `a`, returned unchanged. + """ + # a, b = b, a + + diff = a-b + jax.debug.print("Comparing arrays {name}: min={min}, max={max}, mean={mean}, std={std}", name=name, min=jnp.min(diff), max=jnp.max(diff), mean=jnp.mean(diff), std=jnp.std(diff)) + + return a + + out_f32 = inspect_array(a.astype(jnp.float32) - b.astype(jnp.float32), name) + b.astype(jnp.float32) + return out_f32.astype(a.dtype) def load_array_dump(filename: str, shape: tuple, dtype: jnp.dtype) -> jnp.ndarray: """Utility function to load a JAX array from a dumped binary file. diff --git a/transformer_engine/jax/flax/module.py b/transformer_engine/jax/flax/module.py index 7941c509c5..d5cf1ec8cf 100644 --- a/transformer_engine/jax/flax/module.py +++ b/transformer_engine/jax/flax/module.py @@ -1536,14 +1536,10 @@ def reorder_rhs_for_grouped_gemm(tensor, bdims, cdims): def make_ragged_dot_cls(quantization_recipe): - import jax - def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwargs): num_groups = group_sizes.shape[0] quantizer_set = generate_quantizer_set(n_groups=num_groups) - target_out_shape = jax.lax.ragged_dot(x, kernel, group_sizes=group_sizes).shape - out = grouped_dense( x, kernel, @@ -1551,8 +1547,7 @@ def te_grouped_dot_general(generate_quantizer_set, x, kernel, group_sizes, **kwa contracting_dims=((1,), (1,)), # quantizer_set=quantizer_set ) - - return out.reshape(target_out_shape) + return out return wrap_function_in_te_state_module( te_grouped_dot_general, quantization_recipe, "ragged_dot" From 697daba28912984a171abd6afe1159d6c41197b0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Feb 2026 00:18:36 +0000 Subject: [PATCH 94/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../jax/csrc/extensions/gemm.cpp | 26 +++++++------------ .../jax/debug/experimental/inspect.py | 17 +++++++++--- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 226d6bfeeb..520d1bea9d 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -710,14 +710,11 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // TODO(jberchtold): make this memset smaller by only zeroing the expert weights that correspond to groups with size zero. cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); - nvte_grouped_gemm( - rhs_tensor, rhs_is_trans, - lhs_tensor, lhs_is_trans, - nullptr, out_tensor, - alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), - workspace_cublas.data(), - nullptr, // config (use defaults) - stream); + nvte_grouped_gemm(rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, nullptr, out_tensor, + alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), + workspace_cublas.data(), + nullptr, // config (use defaults) + stream); return ffi_with_cuda_error_check(); } @@ -750,14 +747,11 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type // This memset is required because the group sizes may not fill the full buffer since we overallocate for the worst case. However, in theory unused space on the grouped axis should not be utilizied downstream, but it seems like somehow it is utilized. cudaMemsetAsync(output->untyped_data(), 0, output->size_bytes(), stream); - nvte_grouped_gemm( - rhs_tensor, rhs_is_trans, - lhs_tensor, lhs_is_trans, - nullptr, out_tensor, - alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), - workspace_cublas.data(), - nullptr, // config (use defaults) - stream); + nvte_grouped_gemm(rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, nullptr, out_tensor, + alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(), + workspace_cublas.data(), + nullptr, // config (use defaults) + stream); // std::vector host_group_sizes(num_gemms); // cudaMemcpyAsync(host_group_sizes.data(), group_sizes.untyped_data(), num_gemms * sizeof(int32_t), diff --git a/transformer_engine/jax/debug/experimental/inspect.py b/transformer_engine/jax/debug/experimental/inspect.py index fc883f742a..a82b60d9db 100644 --- a/transformer_engine/jax/debug/experimental/inspect.py +++ b/transformer_engine/jax/debug/experimental/inspect.py @@ -152,6 +152,7 @@ def inspect_array(x: jnp.ndarray, name: str) -> jnp.ndarray: # TODO: Handle the name of the tensor in the primitive and output files return _inspect(x) + def compare(a: jnp.ndarray, b: jnp.ndarray, name: str) -> jnp.ndarray: """Utility function to compare two JAX arrays and print their differences. @@ -165,14 +166,24 @@ def compare(a: jnp.ndarray, b: jnp.ndarray, name: str) -> jnp.ndarray: """ # a, b = b, a - diff = a-b - jax.debug.print("Comparing arrays {name}: min={min}, max={max}, mean={mean}, std={std}", name=name, min=jnp.min(diff), max=jnp.max(diff), mean=jnp.mean(diff), std=jnp.std(diff)) + diff = a - b + jax.debug.print( + "Comparing arrays {name}: min={min}, max={max}, mean={mean}, std={std}", + name=name, + min=jnp.min(diff), + max=jnp.max(diff), + mean=jnp.mean(diff), + std=jnp.std(diff), + ) return a - out_f32 = inspect_array(a.astype(jnp.float32) - b.astype(jnp.float32), name) + b.astype(jnp.float32) + out_f32 = inspect_array(a.astype(jnp.float32) - b.astype(jnp.float32), name) + b.astype( + jnp.float32 + ) return out_f32.astype(a.dtype) + def load_array_dump(filename: str, shape: tuple, dtype: jnp.dtype) -> jnp.ndarray: """Utility function to load a JAX array from a dumped binary file. From 6d576a91a2264c1af77fb84b07d5f30196ef8e3e Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Wed, 11 Feb 2026 16:32:45 -0800 Subject: [PATCH 95/98] Revert "JAX C++ extension RTC" This reverts commit e43ba1f864222e879feed5a71d910099513ceeaf. --- setup.py | 20 +- transformer_engine/jax/__init__.py | 4 - transformer_engine/jax/rtc/__init__.py | 1 - transformer_engine/jax/rtc/rtc.py | 163 ----------- transformer_engine/jax/rtc/utils.py | 372 ------------------------- transformer_engine/jax/setup.py | 6 +- 6 files changed, 13 insertions(+), 553 deletions(-) delete mode 100644 transformer_engine/jax/rtc/__init__.py delete mode 100644 transformer_engine/jax/rtc/rtc.py delete mode 100644 transformer_engine/jax/rtc/utils.py diff --git a/setup.py b/setup.py index f3bff3efce..18bb736f24 100644 --- a/setup.py +++ b/setup.py @@ -223,16 +223,16 @@ def git_check_submodules() -> None: current_file_path / "transformer_engine", ) ) - # if "jax" in frameworks: - # from build_tools.jax import setup_jax_extension - - # ext_modules.append( - # setup_jax_extension( - # "transformer_engine/jax/csrc", - # current_file_path / "transformer_engine" / "jax" / "csrc", - # current_file_path / "transformer_engine", - # ) - # ) + if "jax" in frameworks: + from build_tools.jax import setup_jax_extension + + ext_modules.append( + setup_jax_extension( + "transformer_engine/jax/csrc", + current_file_path / "transformer_engine" / "jax" / "csrc", + current_file_path / "transformer_engine", + ) + ) # Configure package setuptools.setup( diff --git a/transformer_engine/jax/__init__.py b/transformer_engine/jax/__init__.py index 805b46fc39..d0afc1ff25 100644 --- a/transformer_engine/jax/__init__.py +++ b/transformer_engine/jax/__init__.py @@ -29,10 +29,6 @@ from transformer_engine.common import load_framework_extension -from . import rtc - -print("Compiling JAX RTC extension...") -rtc.compile_extension() load_framework_extension("jax") from . import flax diff --git a/transformer_engine/jax/rtc/__init__.py b/transformer_engine/jax/rtc/__init__.py deleted file mode 100644 index 62d9367ccc..0000000000 --- a/transformer_engine/jax/rtc/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .rtc import compile_extension diff --git a/transformer_engine/jax/rtc/rtc.py b/transformer_engine/jax/rtc/rtc.py deleted file mode 100644 index b1666118e5..0000000000 --- a/transformer_engine/jax/rtc/rtc.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# See LICENSE for license information. - -"""JAX related extensions.""" -import os -from pathlib import Path -from packaging import version - -import setuptools - -from .utils import get_cuda_include_dirs, all_files_in_dir, debug_build_enabled -from typing import List - - -def install_requirements() -> List[str]: - """Install dependencies for TE/JAX extensions.""" - return ["jax", "flax>=0.7.1"] - - -def test_requirements() -> List[str]: - """Test dependencies for TE/JAX extensions. - - Triton Package Selection: - The triton package is selected based on NVTE_USE_PYTORCH_TRITON environment variable: - - Default (NVTE_USE_PYTORCH_TRITON unset or "0"): - Returns 'triton' - OpenAI's standard package from PyPI. - Install with: pip install triton - - NVTE_USE_PYTORCH_TRITON=1: - Returns 'pytorch-triton' - for mixed JAX+PyTorch environments. - Install with: pip install pytorch-triton --index-url https://download.pytorch.org/whl/cu121 - - Note: Do NOT install pytorch-triton from PyPI directly - that's a placeholder. - """ - use_pytorch_triton = bool(int(os.environ.get("NVTE_USE_PYTORCH_TRITON", "0"))) - - triton_package = "pytorch-triton" if use_pytorch_triton else "triton" - - return [ - "numpy", - triton_package, - ] - - -def xla_path() -> str: - """XLA root path lookup. - Throws FileNotFoundError if XLA source is not found.""" - - try: - import jax - - if version.parse(jax.__version__) >= version.parse("0.5.0"): - from jax import ffi # pylint: disable=ungrouped-imports - else: - from jax.extend import ffi # pylint: disable=ungrouped-imports - - except ImportError: - if os.getenv("XLA_HOME"): - xla_home = Path(os.getenv("XLA_HOME")) - else: - xla_home = "/opt/xla" - else: - xla_home = ffi.include_dir() - - if not os.path.isdir(xla_home): - raise FileNotFoundError("Could not find xla source.") - return xla_home - - -def setup_jax_extension( - csrc_source_files, - csrc_header_files, - common_header_files, -) -> setuptools.Extension: - """Setup PyBind11 extension for JAX support""" - # Source files - csrc_source_files = Path(csrc_source_files) - extensions_dir = csrc_source_files / "extensions" - sources = all_files_in_dir(extensions_dir, name_extension="cpp") - - # Header files - include_dirs = get_cuda_include_dirs() - include_dirs.extend( - [ - common_header_files, - common_header_files / "common", - common_header_files / "common" / "include", - csrc_header_files, - xla_path(), - ] - ) - print("Includ dirs for JAX extension:", include_dirs) - - # Compile flags - cxx_flags = ["-O3"] - if debug_build_enabled(): - cxx_flags.append("-g") - cxx_flags.append("-UNDEBUG") - else: - cxx_flags.append("-g0") - - # Define TE/JAX as a Pybind11Extension - from pybind11.setup_helpers import Pybind11Extension - - return Pybind11Extension( - "transformer_engine_jax", - sources=[str(path) for path in sources], - include_dirs=[str(path) for path in include_dirs], - extra_compile_args=cxx_flags, - libraries=["nccl"], - ) - - -_compiled = False - - -def compile_extension(): - import os - import shutil - - global _compiled - if _compiled: - return - - base_dir = Path(os.path.dirname(__file__)).parent.parent.parent - te_jax_build_dir = base_dir / "build" / "te_jax" - # if os.path.exists(te_jax_build_dir): - # shutil.rmtree(te_jax_build_dir) - - ext = setup_jax_extension( - Path(__file__).resolve().parent.parent / "csrc", - Path(__file__).resolve().parent.parent / "csrc", - Path(__file__).resolve().parent.parent.parent, - ) - from pybind11.setup_helpers import build_ext as BuildExtension - from setuptools import Distribution - import subprocess - - dist = Distribution() - dist.ext_modules = [ext] - cmd = BuildExtension(dist) - cmd.initialize_options() - cmd.parallel = os.cpu_count() # Enable parallel compilation - cmd.finalize_options() - cmd.build_temp = os.path.join(te_jax_build_dir, "temp") - cmd.build_lib = os.path.join(te_jax_build_dir, "lib") - os.makedirs(cmd.build_temp, exist_ok=True) - os.makedirs(cmd.build_lib, exist_ok=True) - cmd.run() - - subprocess.call( - [ - "cp", - os.path.join( - cmd.build_lib, "transformer_engine_jax" + cmd.get_ext_filename(fullname="") - ), - base_dir, - ] - ) - - _compiled = True diff --git a/transformer_engine/jax/rtc/utils.py b/transformer_engine/jax/rtc/utils.py deleted file mode 100644 index 8a52440310..0000000000 --- a/transformer_engine/jax/rtc/utils.py +++ /dev/null @@ -1,372 +0,0 @@ -# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# See LICENSE for license information. - -"""Installation script.""" - -import functools -import glob -import importlib -import os -import re -import shutil -import subprocess -import sys -import platform -from pathlib import Path -from importlib.metadata import version as get_version -from subprocess import CalledProcessError -from typing import List, Optional, Tuple, Union - - -# Needs to stay consistent with .pre-commit-config.yaml config. -def min_python_version() -> Tuple[int]: - """Minimum supported Python version.""" - return (3, 10, 0) - - -def min_python_version_str() -> str: - """String representing minimum supported Python version.""" - return ".".join(map(str, min_python_version())) - - -if sys.version_info < min_python_version(): - raise RuntimeError( - f"Transformer Engine requires Python {min_python_version_str()} or newer, " - f"but found Python {platform.python_version()}." - ) - - -@functools.lru_cache(maxsize=None) -def debug_build_enabled() -> bool: - """Whether to build with a debug configuration""" - return bool(int(os.getenv("NVTE_BUILD_DEBUG", "0"))) - - -@functools.lru_cache(maxsize=None) -def get_max_jobs_for_parallel_build() -> int: - """Number of parallel jobs for Nina build""" - - # Default: maximum parallel jobs - num_jobs = 0 - - # Check environment variable - if os.getenv("NVTE_BUILD_MAX_JOBS"): - num_jobs = int(os.getenv("NVTE_BUILD_MAX_JOBS")) - elif os.getenv("MAX_JOBS"): - num_jobs = int(os.getenv("MAX_JOBS")) - - # Check command-line arguments - for arg in sys.argv.copy(): - if arg.startswith("--parallel="): - num_jobs = int(arg.replace("--parallel=", "")) - sys.argv.remove(arg) - - return num_jobs - - -def all_files_in_dir(path, name_extension=None): - all_files = [] - for dirname, _, names in os.walk(path): - for name in names: - if name_extension is not None and not name.endswith(f".{name_extension}"): - continue - all_files.append(Path(dirname, name)) - return all_files - - -def remove_dups(_list: List): - return list(set(_list)) - - -def found_cmake() -> bool: - """ "Check if valid CMake is available - - CMake 3.18 or newer is required. - - """ - - # Check if CMake is available - try: - _cmake_bin = cmake_bin() - except FileNotFoundError: - return False - - # Query CMake for version info - output = subprocess.run( - [_cmake_bin, "--version"], - capture_output=True, - check=True, - universal_newlines=True, - ) - match = re.search(r"version\s*([\d.]+)", output.stdout) - version = match.group(1).split(".") - version = tuple(int(v) for v in version) - return version >= (3, 18) - - -def cmake_bin() -> Path: - """Get CMake executable - - Throws FileNotFoundError if not found. - - """ - - # Search in CMake Python package - _cmake_bin: Optional[Path] = None - try: - from cmake import CMAKE_BIN_DIR - except ImportError: - pass - else: - _cmake_bin = Path(CMAKE_BIN_DIR).resolve() / "cmake" - if not _cmake_bin.is_file(): - _cmake_bin = None - - # Search in path - if _cmake_bin is None: - _cmake_bin = shutil.which("cmake") - if _cmake_bin is not None: - _cmake_bin = Path(_cmake_bin).resolve() - - # Return executable if found - if _cmake_bin is None: - raise FileNotFoundError("Could not find CMake executable") - return _cmake_bin - - -def found_ninja() -> bool: - """ "Check if Ninja is available""" - return shutil.which("ninja") is not None - - -def found_pybind11() -> bool: - """ "Check if pybind11 is available""" - - # Check if Python package is installed - try: - import pybind11 - except ImportError: - pass - else: - return True - - # Check if CMake can find pybind11 - if not found_cmake(): - return False - try: - subprocess.run( - [ - "cmake", - "--find-package", - "-DMODE=EXIST", - "-DNAME=pybind11", - "-DCOMPILER_ID=CXX", - "-DLANGUAGE=CXX", - ], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - check=True, - ) - except (CalledProcessError, OSError): - pass - else: - return True - return False - - -@functools.lru_cache(maxsize=None) -def cuda_toolkit_include_path() -> Tuple[str, str]: - """Returns root path for cuda toolkit includes. - - return `None` if CUDA is not found.""" - # Try finding CUDA - cuda_home: Optional[Path] = None - if cuda_home is None and os.getenv("CUDA_HOME"): - # Check in CUDA_HOME - cuda_home = Path(os.getenv("CUDA_HOME")) / "include" - if cuda_home is None: - # Check in NVCC - nvcc_bin = shutil.which("nvcc") - if nvcc_bin is not None: - cuda_home = Path(nvcc_bin.rstrip("/bin/nvcc")) / "include" - if cuda_home is None: - # Last-ditch guess in /usr/local/cuda - if Path("/usr/local/cuda").is_dir(): - cuda_home = Path("/usr/local/cuda") / "include" - return cuda_home - - -@functools.lru_cache(maxsize=None) -def nvcc_path() -> Tuple[str, str]: - """Returns the NVCC binary path. - - Throws FileNotFoundError if NVCC is not found.""" - # Try finding NVCC - nvcc_bin: Optional[Path] = None - if nvcc_bin is None and os.getenv("CUDA_HOME"): - # Check in CUDA_HOME - cuda_home = Path(os.getenv("CUDA_HOME")) - nvcc_bin = cuda_home / "bin" / "nvcc" - if nvcc_bin is None: - # Check if nvcc is in path - nvcc_bin = shutil.which("nvcc") - if nvcc_bin is not None: - cuda_home = Path(nvcc_bin.rstrip("/bin/nvcc")) - nvcc_bin = Path(nvcc_bin) - if nvcc_bin is None: - # Last-ditch guess in /usr/local/cuda - cuda_home = Path("/usr/local/cuda") - nvcc_bin = cuda_home / "bin" / "nvcc" - if not nvcc_bin.is_file(): - raise FileNotFoundError(f"Could not find NVCC at {nvcc_bin}") - - return nvcc_bin - - -@functools.lru_cache(maxsize=None) -def get_cuda_include_dirs() -> Tuple[str, str]: - """Returns the CUDA header directory.""" - - # If cuda is installed via toolkit, all necessary headers - # are bundled inside the top level cuda directory. - if cuda_toolkit_include_path() is not None: - return [cuda_toolkit_include_path()] - - # Use pip wheels to include all headers. - try: - import nvidia - except ModuleNotFoundError as e: - raise RuntimeError("CUDA not found.") - - cuda_root = Path(nvidia.__file__).parent - return [ - subdir / "include" - for subdir in cuda_root.iterdir() - if subdir.is_dir() and (subdir / "include").is_dir() - ] - - -@functools.lru_cache(maxsize=None) -def cuda_archs() -> str: - archs = os.getenv("NVTE_CUDA_ARCHS") - if archs is None: - version = cuda_version() - if version >= (13, 0): - archs = "75;80;89;90;100;120" - elif version >= (12, 8): - archs = "70;80;89;90;100;120" - else: - archs = "70;80;89;90" - return archs - - -def cuda_version() -> Tuple[int, ...]: - """CUDA Toolkit version as a (major, minor) tuple. - - Try to get cuda version by locating the nvcc executable and running nvcc --version. If - nvcc is not found, look for the cuda runtime package pip `nvidia-cuda-runtime-cu12` - and check pip version. - """ - - try: - nvcc_bin = nvcc_path() - except FileNotFoundError as e: - pass - else: - output = subprocess.run( - [nvcc_bin, "-V"], - capture_output=True, - check=True, - universal_newlines=True, - ) - match = re.search(r"release\s*([\d.]+)", output.stdout) - version = match.group(1).split(".") - return tuple(int(v) for v in version) - - try: - version_str = get_version("nvidia-cuda-runtime-cu12") - version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) - return version_tuple - except importlib.metadata.PackageNotFoundError: - raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.") - - -def get_frameworks() -> List[str]: - """DL frameworks to build support for""" - _frameworks: List[str] = [] - supported_frameworks = ["pytorch", "jax"] - - # Check environment variable - if os.getenv("NVTE_FRAMEWORK"): - _frameworks.extend(os.getenv("NVTE_FRAMEWORK").split(",")) - - # Check command-line arguments - for arg in sys.argv.copy(): - if arg.startswith("--framework="): - _frameworks.extend(arg.replace("--framework=", "").split(",")) - sys.argv.remove(arg) - - # Detect installed frameworks if not explicitly specified - if not _frameworks: - try: - import torch - except ImportError: - pass - else: - _frameworks.append("pytorch") - try: - import jax - except ImportError: - pass - else: - _frameworks.append("jax") - - # Special framework names - if "all" in _frameworks: - _frameworks = supported_frameworks.copy() - if "none" in _frameworks: - _frameworks = [] - - # Check that frameworks are valid - _frameworks = [framework.lower() for framework in _frameworks] - for framework in _frameworks: - if framework not in supported_frameworks: - raise ValueError(f"Transformer Engine does not support framework={framework}") - - return _frameworks - - -def copy_common_headers( - src_dir: Union[Path, str], - dst_dir: Union[Path, str], -) -> None: - """Copy headers from core library - - src_dir should be the transformer_engine directory within the root - Transformer Engine repository. All .h and .cuh files within - transformer_engine/common are copied into dst_dir. Relative paths - are preserved. - - """ - - # Find common header files in src dir - headers = glob.glob( - os.path.join(str(src_dir), "common", "**", "*.h"), - recursive=True, - ) - headers.extend( - glob.glob( - os.path.join(str(src_dir), "common", "**", "*.cuh"), - recursive=True, - ) - ) - headers = [Path(path) for path in headers] - - # Copy common header files to dst dir - src_dir = Path(src_dir) - dst_dir = Path(dst_dir) - for path in headers: - new_path = dst_dir / path.relative_to(src_dir) - new_path.parent.mkdir(exist_ok=True, parents=True) - shutil.copy(path, new_path) diff --git a/transformer_engine/jax/setup.py b/transformer_engine/jax/setup.py index f0a304c1c2..2d25242825 100644 --- a/transformer_engine/jax/setup.py +++ b/transformer_engine/jax/setup.py @@ -108,9 +108,9 @@ def get_cuda_major_version() -> int: common_headers_dir = "common_headers" copy_common_headers(current_file_path.parent, str(current_file_path / common_headers_dir)) ext_modules = [ - # setup_jax_extension( - # "csrc", current_file_path / "csrc", current_file_path / common_headers_dir - # ) + setup_jax_extension( + "csrc", current_file_path / "csrc", current_file_path / common_headers_dir + ) ] # Setup version and requirements. From a93d053dab3f17f2500a6295dc18f18eaa8a30c5 Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 12 Feb 2026 14:03:52 -0800 Subject: [PATCH 96/98] comparison VJP --- .../jax/debug/experimental/__init__.py | 3 +- .../jax/debug/experimental/inspect.py | 63 ++++++++++++++++++- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/transformer_engine/jax/debug/experimental/__init__.py b/transformer_engine/jax/debug/experimental/__init__.py index 551fa95f6e..4a480c7d15 100644 --- a/transformer_engine/jax/debug/experimental/__init__.py +++ b/transformer_engine/jax/debug/experimental/__init__.py @@ -6,10 +6,11 @@ This API is experimental and may change or be removed without deprecation in future releases. """ -from .inspect import compare, inspect_array, load_array_dump +from .inspect import compare, compare_vjp, inspect_array, load_array_dump __all__ = [ "compare", + "compare_vjp", "inspect_array", "load_array_dump", ] diff --git a/transformer_engine/jax/debug/experimental/inspect.py b/transformer_engine/jax/debug/experimental/inspect.py index a82b60d9db..2af7973db8 100644 --- a/transformer_engine/jax/debug/experimental/inspect.py +++ b/transformer_engine/jax/debug/experimental/inspect.py @@ -11,7 +11,7 @@ from transformer_engine.jax.cpp_extensions.base import BasePrimitive, register_primitive -__all__ = ["inspect_array", "load_array_dump"] +__all__ = ["compare", "compare_vjp", "inspect_array", "load_array_dump"] class InspectPrimitive(BasePrimitive): @@ -152,7 +152,6 @@ def inspect_array(x: jnp.ndarray, name: str) -> jnp.ndarray: # TODO: Handle the name of the tensor in the primitive and output files return _inspect(x) - def compare(a: jnp.ndarray, b: jnp.ndarray, name: str) -> jnp.ndarray: """Utility function to compare two JAX arrays and print their differences. @@ -183,6 +182,66 @@ def compare(a: jnp.ndarray, b: jnp.ndarray, name: str) -> jnp.ndarray: ) return out_f32.astype(a.dtype) +def compare_vjp(f1: callable, f2: callable, name: str) -> callable: + """Utility function to compare the outputs of two functions and in the forward and backward passes. + + Handles non-differentiable arguments (e.g., integer arrays) gracefully by + detecting float0 gradients and passing them through without comparison. + + Args: + f1 (callable): The first function to compare. + f2 (callable): The second function to compare. + name (str): The name of the comparison for identification in the output. + + Returns: + callable: A new function that compares the outputs of `f1` and `f2` when called and returns the result of `f1`. + """ + + @jax.custom_vjp + def _f(*args): + return _f_fwd_rule(*args)[0] + + def _f_fwd_rule(*args): + out1, f1_vjp_func = jax.vjp(f1, *args) + out2, f2_vjp_func = jax.vjp(f2, *args) + out = compare(out1, out2, name + "_fwd") + return out, (f1_vjp_func, f2_vjp_func) + + def _has_float0(x): + """Check if a pytree leaf or structure contains float0 dtypes.""" + leaves = jax.tree_util.tree_leaves(x) + return any( + hasattr(leaf, "dtype") and leaf.dtype == jax.dtypes.float0 + for leaf in leaves + ) + + def _f_bwd_rule(res, g): + f1_vjp_func, f2_vjp_func = res + f1_grads = f1_vjp_func(g) + f2_grads = f2_vjp_func(g) + out_grads = [] + for i, (g1, g2) in enumerate(zip(f1_grads, f2_grads)): + # Integer/non-differentiable arguments produce float0 gradients + # which don't support arithmetic. Pass them through without comparison. + if _has_float0(g1): + out_grads.append(g1) + elif isinstance(g1, jnp.ndarray): + out_grads.append(compare(g1, g2, name + f"_grad_{i}")) + else: + # g1 is a pytree of arrays — compare leaf by leaf + g1_flat, tree_def = jax.tree_util.tree_flatten(g1) + g2_flat, _ = jax.tree_util.tree_flatten(g2) + compared = [ + compare(a, b, name + f"_grad_{i}_{j}") + for j, (a, b) in enumerate(zip(g1_flat, g2_flat)) + ] + out_grads.append(jax.tree_util.tree_unflatten(tree_def, compared)) + return tuple(out_grads) + + _f.defvjp(_f_fwd_rule, _f_bwd_rule) + + return _f + def load_array_dump(filename: str, shape: tuple, dtype: jnp.dtype) -> jnp.ndarray: """Utility function to load a JAX array from a dumped binary file. From 8a3ec7f911ee8171482327a5ed4d1ca94a330fae Mon Sep 17 00:00:00 2001 From: Jeremy Berchtold Date: Thu, 12 Feb 2026 17:05:07 -0800 Subject: [PATCH 97/98] fixed wgrad --- transformer_engine/jax/cpp_extensions/gemm.py | 2 +- .../jax/csrc/extensions/gemm.cpp | 20 +++++- .../jax/debug/experimental/inspect.py | 69 ++++++++++++++++++- 3 files changed, 86 insertions(+), 5 deletions(-) diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py index 73b0475bef..29d80fdaa0 100644 --- a/transformer_engine/jax/cpp_extensions/gemm.py +++ b/transformer_engine/jax/cpp_extensions/gemm.py @@ -2152,7 +2152,7 @@ def grouped_gemm( ) # Offset is by number of elements total, not number of rows # HACK: this _out is really the rhs in this case group_offset_out = ( - group_offset * 1 + group_offset * N ) # Offset is by number of elements total, not number of rows else: group_offset_lhs = ( diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp index 520d1bea9d..7487972210 100644 --- a/transformer_engine/jax/csrc/extensions/gemm.cpp +++ b/transformer_engine/jax/csrc/extensions/gemm.cpp @@ -696,10 +696,26 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type rhs_tensor.set_group_info(group_sizes, group_offset_out, kNVTEGroupedFirstDims); //// LHS - NVTEShape lhsShape{.data = {m, k}, .ndim = 2}; + NVTEShape lhsShape{.data = {k, m}, .ndim = 2}; lhs_is_trans = true; auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape); - lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedLastDims); + lhs_tensor.set_group_info(group_sizes, group_offset_lhs, kNVTEGroupedFirstDims); + + printf("LHS shape: "); + for (auto dim : lhs_data.dimensions()) { + printf("%zu, ", dim); + } + printf("\n"); + printf("RHS shape: "); + for (auto dim : rhs_data.dimensions()) { + printf("%zu, ", dim); + } + printf("\n"); + printf("Output shape: "); + for (auto dim : output->dimensions()) { + printf("%zu, ", dim); + } + printf("\n"); //// OUTPUT NVTEShape outShape{.data = {num_gemms * m, n}, .ndim = 2}; diff --git a/transformer_engine/jax/debug/experimental/inspect.py b/transformer_engine/jax/debug/experimental/inspect.py index 2af7973db8..bec474e299 100644 --- a/transformer_engine/jax/debug/experimental/inspect.py +++ b/transformer_engine/jax/debug/experimental/inspect.py @@ -182,6 +182,66 @@ def compare(a: jnp.ndarray, b: jnp.ndarray, name: str) -> jnp.ndarray: ) return out_f32.astype(a.dtype) +def _tensor_to_image(tensor, value_range=None): + import numpy as np + from PIL import Image + + # Convert to numpy + tensor_np = jnp.array(tensor, dtype=jnp.float32) + + # Replace NaNs with a large value for visualization + tensor_np = jnp.where(jnp.isnan(tensor_np), 5000, tensor_np) + + # Determine normalization range + if value_range is None: + min_val = tensor_np.min() + max_val = tensor_np.max() + else: + min_val, max_val = value_range + + # Normalize to 0-255 range for visualization + range_val = max_val - min_val + 1e-8 + normalized = jnp.clip((tensor_np - min_val) / range_val * 255, 0, 255) + + # Downsample by averaging 4x4 blocks + h, w = normalized.shape + new_h, new_w = h // 4, w // 4 + normalized = normalized[: new_h * 4, : new_w * 4] # Trim to multiple of 4 + normalized = normalized.reshape(new_h, 4, new_w, 4).mean(axis=(1, 3)) + normalized = np.array(normalized) + normalized_uint8 = normalized.astype(np.uint8) + + # Create grayscale image + img = Image.fromarray(normalized_uint8, mode="L") + return img + +_count = 0 +def _tensor_diff_to_image(out, ref): + import os + import math + + os.makedirs("debug_outputs", exist_ok=True) + + global _count + + if _count > 50: + return + + out = out.reshape((math.prod(out.shape[:-1]), out.shape[-1])).astype(jnp.float32) + ref = ref.reshape((math.prod(ref.shape[:-1]), ref.shape[-1])).astype(jnp.float32) + + _tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save(f"debug_outputs/output_te_{_count}.png") + _tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save(f"debug_outputs/output_ref_{_count}.png") + diff = jnp.abs(out.astype(jnp.float32) - ref.astype(jnp.float32)) + _tensor_to_image( + diff, + value_range=(jnp.min(diff), jnp.max(diff)), + # value_range=(jnp.min(ref), jnp.max(ref)), + # value_range=(0, 0.5) + ).save(f"debug_outputs/output_diff_{_count}.png") + + _count += 1 + def compare_vjp(f1: callable, f2: callable, name: str) -> callable: """Utility function to compare the outputs of two functions and in the forward and backward passes. @@ -205,7 +265,7 @@ def _f_fwd_rule(*args): out1, f1_vjp_func = jax.vjp(f1, *args) out2, f2_vjp_func = jax.vjp(f2, *args) out = compare(out1, out2, name + "_fwd") - return out, (f1_vjp_func, f2_vjp_func) + return out, (f1_vjp_func, f2_vjp_func, args[2]) def _has_float0(x): """Check if a pytree leaf or structure contains float0 dtypes.""" @@ -216,16 +276,21 @@ def _has_float0(x): ) def _f_bwd_rule(res, g): - f1_vjp_func, f2_vjp_func = res + f1_vjp_func, f2_vjp_func, group_sizes = res f1_grads = f1_vjp_func(g) f2_grads = f2_vjp_func(g) out_grads = [] + jax.debug.print("Group sizes: {}", group_sizes) for i, (g1, g2) in enumerate(zip(f1_grads, f2_grads)): # Integer/non-differentiable arguments produce float0 gradients # which don't support arithmetic. Pass them through without comparison. if _has_float0(g1): out_grads.append(g1) elif isinstance(g1, jnp.ndarray): + # jax.debug.print("F1 {name}: min={min}, max={max}, mean={mean}, std={std}", name=name + f"_grad_{i}", min=jnp.min(g1), max=jnp.max(g1), mean=jnp.mean(g1), std=jnp.std(g1)) + # jax.debug.print("F2 {name}: min={min}, max={max}, mean={mean}, std={std}", name=name + f"_grad_{i}", min=jnp.min(g2), max=jnp.max(g2), mean=jnp.mean(g2), std=jnp.std(g2)) + # if i == 1: # wgrad + # jax.debug.callback(_tensor_diff_to_image, g1, g2) out_grads.append(compare(g1, g2, name + f"_grad_{i}")) else: # g1 is a pytree of arrays — compare leaf by leaf From f20e99d77228bee992c300411484c15ad4653c49 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Feb 2026 01:06:50 +0000 Subject: [PATCH 98/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../jax/debug/experimental/inspect.py | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/transformer_engine/jax/debug/experimental/inspect.py b/transformer_engine/jax/debug/experimental/inspect.py index bec474e299..86cbccdf17 100644 --- a/transformer_engine/jax/debug/experimental/inspect.py +++ b/transformer_engine/jax/debug/experimental/inspect.py @@ -152,6 +152,7 @@ def inspect_array(x: jnp.ndarray, name: str) -> jnp.ndarray: # TODO: Handle the name of the tensor in the primitive and output files return _inspect(x) + def compare(a: jnp.ndarray, b: jnp.ndarray, name: str) -> jnp.ndarray: """Utility function to compare two JAX arrays and print their differences. @@ -182,6 +183,7 @@ def compare(a: jnp.ndarray, b: jnp.ndarray, name: str) -> jnp.ndarray: ) return out_f32.astype(a.dtype) + def _tensor_to_image(tensor, value_range=None): import numpy as np from PIL import Image @@ -215,7 +217,10 @@ def _tensor_to_image(tensor, value_range=None): img = Image.fromarray(normalized_uint8, mode="L") return img + _count = 0 + + def _tensor_diff_to_image(out, ref): import os import math @@ -226,12 +231,16 @@ def _tensor_diff_to_image(out, ref): if _count > 50: return - + out = out.reshape((math.prod(out.shape[:-1]), out.shape[-1])).astype(jnp.float32) ref = ref.reshape((math.prod(ref.shape[:-1]), ref.shape[-1])).astype(jnp.float32) - _tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save(f"debug_outputs/output_te_{_count}.png") - _tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save(f"debug_outputs/output_ref_{_count}.png") + _tensor_to_image(out, value_range=(jnp.min(ref), jnp.max(ref))).save( + f"debug_outputs/output_te_{_count}.png" + ) + _tensor_to_image(ref, value_range=(jnp.min(ref), jnp.max(ref))).save( + f"debug_outputs/output_ref_{_count}.png" + ) diff = jnp.abs(out.astype(jnp.float32) - ref.astype(jnp.float32)) _tensor_to_image( diff, @@ -242,6 +251,7 @@ def _tensor_diff_to_image(out, ref): _count += 1 + def compare_vjp(f1: callable, f2: callable, name: str) -> callable: """Utility function to compare the outputs of two functions and in the forward and backward passes. @@ -260,20 +270,17 @@ def compare_vjp(f1: callable, f2: callable, name: str) -> callable: @jax.custom_vjp def _f(*args): return _f_fwd_rule(*args)[0] - + def _f_fwd_rule(*args): out1, f1_vjp_func = jax.vjp(f1, *args) out2, f2_vjp_func = jax.vjp(f2, *args) out = compare(out1, out2, name + "_fwd") return out, (f1_vjp_func, f2_vjp_func, args[2]) - + def _has_float0(x): """Check if a pytree leaf or structure contains float0 dtypes.""" leaves = jax.tree_util.tree_leaves(x) - return any( - hasattr(leaf, "dtype") and leaf.dtype == jax.dtypes.float0 - for leaf in leaves - ) + return any(hasattr(leaf, "dtype") and leaf.dtype == jax.dtypes.float0 for leaf in leaves) def _f_bwd_rule(res, g): f1_vjp_func, f2_vjp_func, group_sizes = res @@ -302,7 +309,7 @@ def _f_bwd_rule(res, g): ] out_grads.append(jax.tree_util.tree_unflatten(tree_def, compared)) return tuple(out_grads) - + _f.defvjp(_f_fwd_rule, _f_bwd_rule) return _f