From bd99b95993600cdde01cc3fc4d02fce110559535 Mon Sep 17 00:00:00 2001 From: vince62s Date: Fri, 7 Jun 2024 13:11:29 +0200 Subject: [PATCH 1/3] rename num_kv remove multiquery --- eole/bin/convert/convert_HF.py | 27 +++++--------- eole/config/models.py | 30 +++++++-------- eole/modules/multi_headed_attn.py | 62 ++++++++++++------------------- 3 files changed, 47 insertions(+), 72 deletions(-) diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index 088a2d173..643455083 100755 --- a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -337,25 +337,19 @@ def run(cls, args): mlp_activation_fn = act_table[arch] layer_norm = ln_table[arch] - multiquery = False - if "multi_query" in config.keys(): - multiquery = config["multi_query"] - num_kv = 1 + if "multi_query" in config.keys() and config["multi_query"]: + heads_kv = 1 # might be usefull for old config elif ( "num_key_value_heads" in config.keys() and config["num_key_value_heads"] != heads ): - num_kv = config["num_key_value_heads"] + heads_kv = config["num_key_value_heads"] elif "num_kv_heads" in config.keys() and config["num_kv_heads"] != heads: - num_kv = config["num_kv_heads"] + heads_kv = config["num_kv_heads"] elif "n_head_kv" in config.keys() and config["n_head_kv"] != heads: - num_kv = config["n_head_kv"] + heads_kv = config["n_head_kv"] else: - num_kv = 0 - if num_kv is None: - num_kv = 0 - - shared_layer = num_kv == 1 + heads_kv = heads if "parallel_attn" in config.keys(): parallel_residual = config["parallel_attn"] @@ -453,7 +447,7 @@ def run(cls, args): rotary_interleave = False if arch == "PhiForCausalLM": parallel_residual = True - shared_layer = True + shared_layer_norm = True add_qkvbias = True add_ffnbias = True rotary_interleave = False @@ -627,7 +621,7 @@ def get_weight(checkpoint, tensor_name): + param ] = w - if shared_layer: + if shared_layer_norm: idx = 0 else: idx = 1 @@ -857,10 +851,9 @@ def get_weight(checkpoint, tensor_name): rotary_theta=rope_theta, rotary_dim=rotary_dim, sliding_window=sliding_window, - multiquery=multiquery, - num_kv=num_kv, + heads_kv=heads_kv, parallel_residual=parallel_residual, - shared_layer_norm=shared_layer, + shared_layer_norm=shared_layer_norm, add_qkvbias=add_qkvbias, add_ffnbias=add_ffnbias, num_experts=num_experts, diff --git a/eole/config/models.py b/eole/config/models.py index 667c7916b..0f4f3de50 100644 --- a/eole/config/models.py +++ b/eole/config/models.py @@ -7,10 +7,10 @@ class EmbeddingsConfig(Config): src_word_vec_size: int = Field( - default=500, description="Word embedding size for src." + default=512, description="Word embedding size for src." ) tgt_word_vec_size: int = Field( - default=500, description="Word embedding size for tgt." + default=512, description="Word embedding size for tgt." ) word_vec_size: int = Field( default=-1, description="Word embedding size for src and tgt." @@ -40,12 +40,12 @@ class EncoderConfig(Config): default="rnn", description="Type of encoder layer(s) to use." ) layers: int = Field(default=2, description="Number of layers in the encoder.") - hidden_size: int = Field(default=500, description="Size of encoder hidden states.") + hidden_size: int = Field(default=512, description="Size of encoder hidden states.") # This field should be set at EmbeddingsConfig level but will be copied here for cases # where input size to the rnn is different to the hidden size src_word_vec_size: int = Field( - default=500, description="Word embedding size for src." 
+ default=512, description="Word embedding size for src." ) @@ -56,12 +56,12 @@ class DecoderConfig(Config): default="rnn", description="Type of decoder layer(s) to use." ) layers: int = Field(default=2, description="Number of layers in the decoder.") - hidden_size: int = Field(default=500, description="Size of decoder hidden states.") + hidden_size: int = Field(default=512, description="Size of decoder hidden states.") # This field should be set at EmbeddingsConfig level but will be copied here for cases # where input size to the rnn is different to the hidden size tgt_word_vec_size: int = Field( - default=500, description="Word embedding size for tgt." + default=512, description="Word embedding size for tgt." ) coverage_attn: bool = Field( default=False, description="Train a coverage attention layer." @@ -197,13 +197,9 @@ class TransformerConfig(Config): description="Add bias to nn.Linear of Query/Key/Value in MHA. " "Note: this will add bias to output projection layer too.", ) - multiquery: bool = Field( - default=False, - description="Use MultiQuery attention (https://arxiv.org/pdf/1911.02150.pdf)", - ) - num_kv: int = Field( - default=0, - description="Number of heads for KV in the variant of MultiQuery attention " + heads_kv: int | None = Field( + default=None, + description="Number of heads for KV. heads_kv=heads if None, else number of heads for KV" "(e.g. Falcon 40B)", ) add_ffnbias: bool = Field( @@ -277,9 +273,11 @@ def _validate_transformer_decoder_config(self): # ) # ) - # multiquery is mostly a decoder thing, but we should make this cleaner at some point - if self.multiquery and self.num_kv == 0: - self.num_kv = 1 + assert ( + self.hidden_size % self.heads == 0 + ), "Transformer Model dimension {} must be divisible by the number of heads {}".format( + self.hidden_size, self.heads + ) return self diff --git a/eole/modules/multi_headed_attn.py b/eole/modules/multi_headed_attn.py index 6c4f291be..45d78192d 100644 --- a/eole/modules/multi_headed_attn.py +++ b/eole/modules/multi_headed_attn.py @@ -270,49 +270,33 @@ def __init__( is_decoder: bool = True, attn_type: str = None, ) -> None: - assert ( - model_config.hidden_size % model_config.heads == 0 - ), "Model dimension must be divisible by the number of heads" self.dim_per_head = model_config.hidden_size // model_config.heads super(MultiHeadedAttention, self).__init__() self.heads = model_config.heads - self.num_kv = model_config.num_kv + self.heads_kv = ( + model_config.heads_kv + if model_config.heads_kv is not None + else model_config.heads + ) self.parallel_gpu = running_config.parallel_gpu - if model_config.num_kv == 0: - assert ( - model_config.hidden_size % self.parallel_gpu == 0 - ), "Model dimension must be divisible by the number of partitions" - self.linear_keys = skip_init( - nn.Linear, - in_features=model_config.hidden_size, - out_features=model_config.hidden_size // self.parallel_gpu, - bias=model_config.add_qkvbias, - ) - self.linear_values = skip_init( - nn.Linear, - in_features=model_config.hidden_size, - out_features=model_config.hidden_size // self.parallel_gpu, - bias=model_config.add_qkvbias, - ) - else: - assert ( - self.dim_per_head * self.num_kv - ) % self.parallel_gpu == 0, ( - "Model dimension must be divisible by the number of partitions" - ) - self.linear_keys = skip_init( - nn.Linear, - in_features=model_config.hidden_size, - out_features=self.dim_per_head * self.num_kv // self.parallel_gpu, - bias=model_config.add_qkvbias, - ) - self.linear_values = skip_init( - nn.Linear, - 
in_features=model_config.hidden_size, - out_features=self.dim_per_head * self.num_kv // self.parallel_gpu, - bias=model_config.add_qkvbias, - ) + assert ( + self.dim_per_head * self.heads_kv + ) % self.parallel_gpu == 0, ( + "Model dimension must be divisible by the number of partitions" + ) + self.linear_keys = skip_init( + nn.Linear, + in_features=model_config.hidden_size, + out_features=self.dim_per_head * self.heads_kv // self.parallel_gpu, + bias=model_config.add_qkvbias, + ) + self.linear_values = skip_init( + nn.Linear, + in_features=model_config.hidden_size, + out_features=self.dim_per_head * self.heads_kv // self.parallel_gpu, + bias=model_config.add_qkvbias, + ) self.linear_query = skip_init( nn.Linear, in_features=model_config.hidden_size, @@ -601,7 +585,7 @@ def forward( ) b, h, l, d = key.size() - if self.num_kv > 0: + if self.heads_kv < self.heads: qh = query.size(1) # expand key on heads dimension when it's less than query heads (multi-query variant) key = key.view(b, -1, 1, l, d).repeat(1, 1, qh // h, 1, 1) From 6d58cd537ba1cc7cef18eb95d260f17abb68f848 Mon Sep 17 00:00:00 2001 From: vince62s Date: Fri, 7 Jun 2024 13:37:45 +0200 Subject: [PATCH 2/3] flake --- eole/bin/convert/convert_HF.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index 643455083..3dcaa9265 100755 --- a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -609,8 +609,6 @@ def get_weight(checkpoint, tensor_name): + param, ) - if num_kv == 0: - num_kv = heads if w is not None: if type(source) == tuple: w = eval("w" + srcmap) From bf7dfdd7c17108bf387b4184914ec67b4134b7a8 Mon Sep 17 00:00:00 2001 From: vince62s Date: Sat, 8 Jun 2024 10:06:44 +0200 Subject: [PATCH 3/3] fix --- eole/bin/convert/convert_HF.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index 3dcaa9265..7ccc6f67c 100755 --- a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -445,6 +445,8 @@ def run(cls, args): add_qkvbias = False add_ffnbias = False rotary_interleave = False + shared_layer_norm = False + if arch == "PhiForCausalLM": parallel_residual = True shared_layer_norm = True
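
Note on the converter change in PATCH 1/3: after this series, convert_HF.py derives a single heads_kv value from the Hugging Face config instead of juggling multiquery and num_kv. The snippet below is a condensed, slightly defensive restatement of that key-resolution logic for illustration only; it is not the eole code, and resolve_heads_kv plus the toy config dicts are made-up names.

def resolve_heads_kv(config, heads):
    """Return the number of K/V heads implied by a HF-style config dict."""
    if config.get("multi_query"):
        return 1  # single shared K/V head (multi-query attention)
    for key in ("num_key_value_heads", "num_kv_heads", "n_head_kv"):
        value = config.get(key)
        if value is not None and value != heads:
            return value
    return heads  # no GQA/MQA key present: regular multi-head attention


print(resolve_heads_kv({"multi_query": True}, heads=32))       # 1
print(resolve_heads_kv({"num_key_value_heads": 8}, heads=32))  # 8
print(resolve_heads_kv({}, heads=32))                          # 32

The fallback to heads is what the new heads_kv default (None, meaning heads_kv = heads) encodes in TransformerConfig: checkpoints that declare none of these keys are treated as plain multi-head attention.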
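
For reference, a minimal self-contained sketch of how one heads_kv value covers standard MHA (heads_kv == heads), grouped-query attention (1 < heads_kv < heads) and multi-query attention (heads_kv == 1). Assumptions: plain PyTorch, no tensor parallelism, no rotary embeddings, no KV cache; SimpleGQAttention is an illustrative name, not the eole MultiHeadedAttention, and it uses repeat_interleave in place of the view/repeat combination in multi_headed_attn.py.

import torch
import torch.nn as nn


class SimpleGQAttention(nn.Module):
    """Toy attention layer parameterized only by heads and heads_kv."""

    def __init__(self, hidden_size, heads, heads_kv=None, bias=False):
        super().__init__()
        assert hidden_size % heads == 0, "hidden_size must be divisible by heads"
        self.heads = heads
        self.heads_kv = heads_kv if heads_kv is not None else heads  # same default as the patch
        assert heads % self.heads_kv == 0, "heads must be a multiple of heads_kv"
        self.dim_per_head = hidden_size // heads
        # Queries keep all heads; keys/values are projected to heads_kv heads only.
        self.linear_query = nn.Linear(hidden_size, hidden_size, bias=bias)
        self.linear_keys = nn.Linear(hidden_size, self.dim_per_head * self.heads_kv, bias=bias)
        self.linear_values = nn.Linear(hidden_size, self.dim_per_head * self.heads_kv, bias=bias)
        self.final_linear = nn.Linear(hidden_size, hidden_size, bias=bias)

    def forward(self, x):
        b, l, _ = x.size()
        q = self.linear_query(x).view(b, l, self.heads, self.dim_per_head).transpose(1, 2)
        k = self.linear_keys(x).view(b, l, self.heads_kv, self.dim_per_head).transpose(1, 2)
        v = self.linear_values(x).view(b, l, self.heads_kv, self.dim_per_head).transpose(1, 2)
        if self.heads_kv < self.heads:
            # Each K/V head is shared by heads // heads_kv query heads, mirroring the
            # key.view(b, -1, 1, l, d).repeat(1, 1, qh // h, 1, 1) step in multi_headed_attn.py.
            groups = self.heads // self.heads_kv
            k = k.repeat_interleave(groups, dim=1)
            v = v.repeat_interleave(groups, dim=1)
        scores = q @ k.transpose(-2, -1) / self.dim_per_head ** 0.5
        out = torch.softmax(scores, dim=-1) @ v
        out = out.transpose(1, 2).contiguous().view(b, l, -1)
        return self.final_linear(out)


x = torch.randn(2, 7, 512)
for heads_kv in (8, 2, 1):  # MHA, GQA, MQA
    print(heads_kv, SimpleGQAttention(512, heads=8, heads_kv=heads_kv)(x).shape)

The divisibility assert on hidden_size and heads here plays the same role as the check the patch moves out of MultiHeadedAttention and into _validate_transformer_decoder_config; the per-partition divisibility check tied to parallel_gpu is deliberately omitted from this sketch.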