From bd99b95993600cdde01cc3fc4d02fce110559535 Mon Sep 17 00:00:00 2001 From: vince62s Date: Fri, 7 Jun 2024 13:11:29 +0200 Subject: [PATCH 1/3] rename num_kv remove multiquery --- eole/bin/convert/convert_HF.py | 27 +++++--------- eole/config/models.py | 30 +++++++-------- eole/modules/multi_headed_attn.py | 62 ++++++++++++------------------- 3 files changed, 47 insertions(+), 72 deletions(-) diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index 088a2d173..643455083 100755 --- a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -337,25 +337,19 @@ def run(cls, args): mlp_activation_fn = act_table[arch] layer_norm = ln_table[arch] - multiquery = False - if "multi_query" in config.keys(): - multiquery = config["multi_query"] - num_kv = 1 + if "multi_query" in config.keys() and config["multi_query"]: + heads_kv = 1 # might be usefull for old config elif ( "num_key_value_heads" in config.keys() and config["num_key_value_heads"] != heads ): - num_kv = config["num_key_value_heads"] + heads_kv = config["num_key_value_heads"] elif "num_kv_heads" in config.keys() and config["num_kv_heads"] != heads: - num_kv = config["num_kv_heads"] + heads_kv = config["num_kv_heads"] elif "n_head_kv" in config.keys() and config["n_head_kv"] != heads: - num_kv = config["n_head_kv"] + heads_kv = config["n_head_kv"] else: - num_kv = 0 - if num_kv is None: - num_kv = 0 - - shared_layer = num_kv == 1 + heads_kv = heads if "parallel_attn" in config.keys(): parallel_residual = config["parallel_attn"] @@ -453,7 +447,7 @@ def run(cls, args): rotary_interleave = False if arch == "PhiForCausalLM": parallel_residual = True - shared_layer = True + shared_layer_norm = True add_qkvbias = True add_ffnbias = True rotary_interleave = False @@ -627,7 +621,7 @@ def get_weight(checkpoint, tensor_name): + param ] = w - if shared_layer: + if shared_layer_norm: idx = 0 else: idx = 1 @@ -857,10 +851,9 @@ def get_weight(checkpoint, tensor_name): rotary_theta=rope_theta, rotary_dim=rotary_dim, sliding_window=sliding_window, - multiquery=multiquery, - num_kv=num_kv, + heads_kv=heads_kv, parallel_residual=parallel_residual, - shared_layer_norm=shared_layer, + shared_layer_norm=shared_layer_norm, add_qkvbias=add_qkvbias, add_ffnbias=add_ffnbias, num_experts=num_experts, diff --git a/eole/config/models.py b/eole/config/models.py index 667c7916b..0f4f3de50 100644 --- a/eole/config/models.py +++ b/eole/config/models.py @@ -7,10 +7,10 @@ class EmbeddingsConfig(Config): src_word_vec_size: int = Field( - default=500, description="Word embedding size for src." + default=512, description="Word embedding size for src." ) tgt_word_vec_size: int = Field( - default=500, description="Word embedding size for tgt." + default=512, description="Word embedding size for tgt." ) word_vec_size: int = Field( default=-1, description="Word embedding size for src and tgt." @@ -40,12 +40,12 @@ class EncoderConfig(Config): default="rnn", description="Type of encoder layer(s) to use." ) layers: int = Field(default=2, description="Number of layers in the encoder.") - hidden_size: int = Field(default=500, description="Size of encoder hidden states.") + hidden_size: int = Field(default=512, description="Size of encoder hidden states.") # This field should be set at EmbeddingsConfig level but will be copied here for cases # where input size to the rnn is different to the hidden size src_word_vec_size: int = Field( - default=500, description="Word embedding size for src." 
+ default=512, description="Word embedding size for src." ) @@ -56,12 +56,12 @@ class DecoderConfig(Config): default="rnn", description="Type of decoder layer(s) to use." ) layers: int = Field(default=2, description="Number of layers in the decoder.") - hidden_size: int = Field(default=500, description="Size of decoder hidden states.") + hidden_size: int = Field(default=512, description="Size of decoder hidden states.") # This field should be set at EmbeddingsConfig level but will be copied here for cases # where input size to the rnn is different to the hidden size tgt_word_vec_size: int = Field( - default=500, description="Word embedding size for tgt." + default=512, description="Word embedding size for tgt." ) coverage_attn: bool = Field( default=False, description="Train a coverage attention layer." @@ -197,13 +197,9 @@ class TransformerConfig(Config): description="Add bias to nn.Linear of Query/Key/Value in MHA. " "Note: this will add bias to output projection layer too.", ) - multiquery: bool = Field( - default=False, - description="Use MultiQuery attention (https://arxiv.org/pdf/1911.02150.pdf)", - ) - num_kv: int = Field( - default=0, - description="Number of heads for KV in the variant of MultiQuery attention " + heads_kv: int | None = Field( + default=None, + description="Number of heads for KV. heads_kv=heads if None, else number of heads for KV" "(e.g. Falcon 40B)", ) add_ffnbias: bool = Field( @@ -277,9 +273,11 @@ def _validate_transformer_decoder_config(self): # ) # ) - # multiquery is mostly a decoder thing, but we should make this cleaner at some point - if self.multiquery and self.num_kv == 0: - self.num_kv = 1 + assert ( + self.hidden_size % self.heads == 0 + ), "Transformer Model dimension {} must be divisible by the number of heads {}".format( + self.hidden_size, self.heads + ) return self diff --git a/eole/modules/multi_headed_attn.py b/eole/modules/multi_headed_attn.py index 6c4f291be..45d78192d 100644 --- a/eole/modules/multi_headed_attn.py +++ b/eole/modules/multi_headed_attn.py @@ -270,49 +270,33 @@ def __init__( is_decoder: bool = True, attn_type: str = None, ) -> None: - assert ( - model_config.hidden_size % model_config.heads == 0 - ), "Model dimension must be divisible by the number of heads" self.dim_per_head = model_config.hidden_size // model_config.heads super(MultiHeadedAttention, self).__init__() self.heads = model_config.heads - self.num_kv = model_config.num_kv + self.heads_kv = ( + model_config.heads_kv + if model_config.heads_kv is not None + else model_config.heads + ) self.parallel_gpu = running_config.parallel_gpu - if model_config.num_kv == 0: - assert ( - model_config.hidden_size % self.parallel_gpu == 0 - ), "Model dimension must be divisible by the number of partitions" - self.linear_keys = skip_init( - nn.Linear, - in_features=model_config.hidden_size, - out_features=model_config.hidden_size // self.parallel_gpu, - bias=model_config.add_qkvbias, - ) - self.linear_values = skip_init( - nn.Linear, - in_features=model_config.hidden_size, - out_features=model_config.hidden_size // self.parallel_gpu, - bias=model_config.add_qkvbias, - ) - else: - assert ( - self.dim_per_head * self.num_kv - ) % self.parallel_gpu == 0, ( - "Model dimension must be divisible by the number of partitions" - ) - self.linear_keys = skip_init( - nn.Linear, - in_features=model_config.hidden_size, - out_features=self.dim_per_head * self.num_kv // self.parallel_gpu, - bias=model_config.add_qkvbias, - ) - self.linear_values = skip_init( - nn.Linear, - 
in_features=model_config.hidden_size, - out_features=self.dim_per_head * self.num_kv // self.parallel_gpu, - bias=model_config.add_qkvbias, - ) + assert ( + self.dim_per_head * self.heads_kv + ) % self.parallel_gpu == 0, ( + "Model dimension must be divisible by the number of partitions" + ) + self.linear_keys = skip_init( + nn.Linear, + in_features=model_config.hidden_size, + out_features=self.dim_per_head * self.heads_kv // self.parallel_gpu, + bias=model_config.add_qkvbias, + ) + self.linear_values = skip_init( + nn.Linear, + in_features=model_config.hidden_size, + out_features=self.dim_per_head * self.heads_kv // self.parallel_gpu, + bias=model_config.add_qkvbias, + ) self.linear_query = skip_init( nn.Linear, in_features=model_config.hidden_size, @@ -601,7 +585,7 @@ def forward( ) b, h, l, d = key.size() - if self.num_kv > 0: + if self.heads_kv < self.heads: qh = query.size(1) # expand key on heads dimension when it's less than query heads (multi-query variant) key = key.view(b, -1, 1, l, d).repeat(1, 1, qh // h, 1, 1) From 6d58cd537ba1cc7cef18eb95d260f17abb68f848 Mon Sep 17 00:00:00 2001 From: vince62s Date: Fri, 7 Jun 2024 13:37:45 +0200 Subject: [PATCH 2/3] flake --- eole/bin/convert/convert_HF.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index 643455083..3dcaa9265 100755 --- a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -609,8 +609,6 @@ def get_weight(checkpoint, tensor_name): + param, ) - if num_kv == 0: - num_kv = heads if w is not None: if type(source) == tuple: w = eval("w" + srcmap) From bf7dfdd7c17108bf387b4184914ec67b4134b7a8 Mon Sep 17 00:00:00 2001 From: vince62s Date: Sat, 8 Jun 2024 10:06:44 +0200 Subject: [PATCH 3/3] fix --- eole/bin/convert/convert_HF.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py index 3dcaa9265..7ccc6f67c 100755 --- a/eole/bin/convert/convert_HF.py +++ b/eole/bin/convert/convert_HF.py @@ -445,6 +445,8 @@ def run(cls, args): add_qkvbias = False add_ffnbias = False rotary_interleave = False + shared_layer_norm = False + if arch == "PhiForCausalLM": parallel_residual = True shared_layer_norm = True
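
Note on the converter change in PATCH 1/3: after this series, convert_HF.py derives a single heads_kv value from the Hugging Face config instead of juggling multiquery and num_kv. The snippet below is a condensed, slightly defensive restatement of that key-resolution logic for illustration only; it is not the eole code, and resolve_heads_kv plus the toy config dicts are made-up names.

def resolve_heads_kv(config, heads):
    """Return the number of K/V heads implied by a HF-style config dict."""
    if config.get("multi_query"):
        return 1  # single shared K/V head (multi-query attention)
    for key in ("num_key_value_heads", "num_kv_heads", "n_head_kv"):
        value = config.get(key)
        if value is not None and value != heads:
            return value
    return heads  # no GQA/MQA key present: regular multi-head attention


print(resolve_heads_kv({"multi_query": True}, heads=32))       # 1
print(resolve_heads_kv({"num_key_value_heads": 8}, heads=32))  # 8
print(resolve_heads_kv({}, heads=32))                          # 32

The fallback to heads is what the new heads_kv default (None, meaning heads_kv = heads) encodes in TransformerConfig: checkpoints that declare none of these keys are treated as plain multi-head attention.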
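
For reference, a minimal self-contained sketch of how one heads_kv value covers standard MHA (heads_kv == heads), grouped-query attention (1 < heads_kv < heads) and multi-query attention (heads_kv == 1). Assumptions: plain PyTorch, no tensor parallelism, no rotary embeddings, no KV cache; SimpleGQAttention is an illustrative name, not the eole MultiHeadedAttention, and it uses repeat_interleave in place of the view/repeat combination in multi_headed_attn.py.

import torch
import torch.nn as nn


class SimpleGQAttention(nn.Module):
    """Toy attention layer parameterized only by heads and heads_kv."""

    def __init__(self, hidden_size, heads, heads_kv=None, bias=False):
        super().__init__()
        assert hidden_size % heads == 0, "hidden_size must be divisible by heads"
        self.heads = heads
        self.heads_kv = heads_kv if heads_kv is not None else heads  # same default as the patch
        assert heads % self.heads_kv == 0, "heads must be a multiple of heads_kv"
        self.dim_per_head = hidden_size // heads
        # Queries keep all heads; keys/values are projected to heads_kv heads only.
        self.linear_query = nn.Linear(hidden_size, hidden_size, bias=bias)
        self.linear_keys = nn.Linear(hidden_size, self.dim_per_head * self.heads_kv, bias=bias)
        self.linear_values = nn.Linear(hidden_size, self.dim_per_head * self.heads_kv, bias=bias)
        self.final_linear = nn.Linear(hidden_size, hidden_size, bias=bias)

    def forward(self, x):
        b, l, _ = x.size()
        q = self.linear_query(x).view(b, l, self.heads, self.dim_per_head).transpose(1, 2)
        k = self.linear_keys(x).view(b, l, self.heads_kv, self.dim_per_head).transpose(1, 2)
        v = self.linear_values(x).view(b, l, self.heads_kv, self.dim_per_head).transpose(1, 2)
        if self.heads_kv < self.heads:
            # Each K/V head is shared by heads // heads_kv query heads, mirroring the
            # key.view(b, -1, 1, l, d).repeat(1, 1, qh // h, 1, 1) step in multi_headed_attn.py.
            groups = self.heads // self.heads_kv
            k = k.repeat_interleave(groups, dim=1)
            v = v.repeat_interleave(groups, dim=1)
        scores = q @ k.transpose(-2, -1) / self.dim_per_head ** 0.5
        out = torch.softmax(scores, dim=-1) @ v
        out = out.transpose(1, 2).contiguous().view(b, l, -1)
        return self.final_linear(out)


x = torch.randn(2, 7, 512)
for heads_kv in (8, 2, 1):  # MHA, GQA, MQA
    print(heads_kv, SimpleGQAttention(512, heads=8, heads_kv=heads_kv)(x).shape)

The divisibility assert on hidden_size and heads here plays the same role as the check the patch moves out of MultiHeadedAttention and into _validate_transformer_decoder_config; the per-partition divisibility check tied to parallel_gpu is deliberately omitted from this sketch.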