From 8724d9737be85c762c03ddd7eececc71996beffd Mon Sep 17 00:00:00 2001
From: Jeff Kinnison
Date: Fri, 22 Sep 2023 16:28:03 -0400
Subject: [PATCH 1/4] do not skip batches for batch size 1

---
 ludwig/data/batcher/random_access.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ludwig/data/batcher/random_access.py b/ludwig/data/batcher/random_access.py
index 91c53b9eacc..2326a1be66e 100644
--- a/ludwig/data/batcher/random_access.py
+++ b/ludwig/data/batcher/random_access.py
@@ -72,7 +72,7 @@ def last_batch(self):
         elif self.ignore_last and self.step:
             # index += batch_size after each batch. So, if our current index into the total dataset is 1 less than the
             # total dataset size, then the last batch will only have 1 row. Drop it if this happens.
-            if self.index - self.total_size == -1:
+            if self.batch_size > 1 and self.index - self.total_size == -1:
                 logger.info("Last batch in epoch only has 1 sample and will be dropped.")
                 return True
         return False

From 6ad55ba59042c6147fd4d0ab6862430596ff6496 Mon Sep 17 00:00:00 2001
From: Jeff Kinnison
Date: Fri, 22 Sep 2023 16:28:35 -0400
Subject: [PATCH 2/4] do not skip last batch for LLMs

---
 ludwig/trainers/trainer.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/ludwig/trainers/trainer.py b/ludwig/trainers/trainer.py
index af93167e957..c02e411c28e 100644
--- a/ludwig/trainers/trainer.py
+++ b/ludwig/trainers/trainer.py
@@ -31,7 +31,7 @@
 import torch
 from torch.utils.tensorboard import SummaryWriter
 
-from ludwig.constants import AUTO, LOSS, MAX_CPU_BATCH_SIZE, MINIMIZE, MODEL_ECD, TEST, TRAINING, VALIDATION
+from ludwig.constants import AUTO, LOSS, MAX_CPU_BATCH_SIZE, MINIMIZE, MODEL_ECD, MODEL_LLM, TEST, TRAINING, VALIDATION
 from ludwig.data.dataset.base import Dataset
 from ludwig.distributed.base import DistributedStrategy, LocalStrategy
 from ludwig.globals import (
@@ -848,7 +848,9 @@ def train(
             should_shuffle=self.should_shuffle,
             random_seed=self.random_seed,
             distributed=self.distributed,
-            ignore_last=True,
+            ignore_last=(
+                self.model.type() != MODEL_LLM
+            ),  # LLMs default to batch size 1, so ignore_last would always skip a batch
             augmentation_pipeline=self.model.get_augmentation_pipelines(),
         ) as batcher:
             # ================ Training Loop ================
@@ -1157,7 +1159,9 @@ def train_online(self, dataset):
             batch_size=self.batch_size,
             should_shuffle=self.should_shuffle,
             distributed=self.distributed,
-            ignore_last=True,
+            ignore_last=(
+                self.model.type() != MODEL_LLM
+            ),  # LLMs default to batch size 1, so ignore_last would always skip a batch
         ) as batcher:
             # training step loop
             progress_bar_config = {
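
To make the off-by-one in patch 1 concrete, the guard can be restated as a standalone predicate. This is a minimal sketch: should_drop_last is a hypothetical helper that mirrors the condition in last_batch(), not part of Ludwig's RandomAccessBatcher.

    # Restating the drop condition from last_batch(); hypothetical helper.
    def should_drop_last(index: int, total_size: int, batch_size: int) -> bool:
        # index has already advanced past the rows consumed so far, so
        # index == total_size - 1 means exactly one row is left in the epoch.
        return batch_size > 1 and index - total_size == -1

    # 129 rows at batch size 128: the trailing 1-row batch is still dropped.
    assert should_drop_last(index=128, total_size=129, batch_size=128) is True
    # 100 rows at batch size 1: every batch has exactly 1 row, so the old
    # condition fired at index 99 and the last sample of every epoch was lost.
    assert should_drop_last(index=99, total_size=100, batch_size=1) is False
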
+ """ # If our current index in the dataset exceeds the size of the dataset, # we've finished the epoch and can indicate that this is the last batch if self.index >= self.total_size: @@ -71,7 +75,8 @@ def last_batch(self): # For e.g., batch size = 128 but the dataset only has 100 rows. elif self.ignore_last and self.step: # index += batch_size after each epoch. So, if our current index in total dataset is 1 less than the total - # dataset size, then the last batch will only have 1 row. Drop it if this happens. + # dataset size, then the last batch will only have 1 row. + # If this happens, we drop the last batch, unless batch_size is 1. if self.batch_size > 1 and self.index - self.total_size == -1: logger.info("Last batch in epoch only has 1 sample and will be dropped.") return True diff --git a/ludwig/trainers/trainer.py b/ludwig/trainers/trainer.py index c02e411c28e..af93167e957 100644 --- a/ludwig/trainers/trainer.py +++ b/ludwig/trainers/trainer.py @@ -31,7 +31,7 @@ import torch from torch.utils.tensorboard import SummaryWriter -from ludwig.constants import AUTO, LOSS, MAX_CPU_BATCH_SIZE, MINIMIZE, MODEL_ECD, MODEL_LLM, TEST, TRAINING, VALIDATION +from ludwig.constants import AUTO, LOSS, MAX_CPU_BATCH_SIZE, MINIMIZE, MODEL_ECD, TEST, TRAINING, VALIDATION from ludwig.data.dataset.base import Dataset from ludwig.distributed.base import DistributedStrategy, LocalStrategy from ludwig.globals import ( @@ -848,9 +848,7 @@ def train( should_shuffle=self.should_shuffle, random_seed=self.random_seed, distributed=self.distributed, - ignore_last=( - self.model.type() != MODEL_LLM - ), # LLMs default to batch size 1, skip_last always skips a batch + ignore_last=True, augmentation_pipeline=self.model.get_augmentation_pipelines(), ) as batcher: # ================ Training Loop ================ @@ -1159,9 +1157,7 @@ def train_online(self, dataset): batch_size=self.batch_size, should_shuffle=self.should_shuffle, distributed=self.distributed, - ignore_last=( - self.model.type() != MODEL_LLM - ), # LLMs default to batch size 1, skip_last always skips a batch, + ignore_last=True, ) as batcher: # training step loop progress_bar_config = { From 03bff769eabae52b00233c619ff9ce214a055301 Mon Sep 17 00:00:00 2001 From: Justin Zhao Date: Tue, 26 Sep 2023 17:09:03 -0400 Subject: [PATCH 4/4] Add a test. 
From 03bff769eabae52b00233c619ff9ce214a055301 Mon Sep 17 00:00:00 2001
From: Justin Zhao
Date: Tue, 26 Sep 2023 17:09:03 -0400
Subject: [PATCH 4/4] Add a test.

---
 tests/integration_tests/test_api.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/integration_tests/test_api.py b/tests/integration_tests/test_api.py
index 232217ab7f4..ec556348e04 100644
--- a/tests/integration_tests/test_api.py
+++ b/tests/integration_tests/test_api.py
@@ -589,6 +589,34 @@ def test_api_callbacks_fixed_train_steps(tmpdir, csv_filename):
     assert mock_callback.on_epoch_start.call_count == 10
 
 
+def test_api_callbacks_batch_size_1(tmpdir, csv_filename):
+    epochs = 2
+    batch_size = 1
+    num_examples = 80
+    mock_callback = mock.Mock(wraps=Callback())
+
+    input_features = [sequence_feature(encoder={"reduce_output": "sum"})]
+    output_features = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")]
+    config = {
+        "input_features": input_features,
+        "output_features": output_features,
+        "combiner": {"type": "concat", "output_size": 14},
+        TRAINER: {"epochs": epochs, "batch_size": batch_size},
+    }
+    model = LudwigModel(config, callbacks=[mock_callback])
+    model.train(
+        training_set=generate_data(
+            input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=num_examples
+        )
+    )
+
+    # There are exactly 2 epoch starts, even with batch_size = 1.
+    assert mock_callback.on_epoch_start.call_count == 2
+    assert mock_callback.on_epoch_end.call_count == 2
+    assert mock_callback.on_batch_start.call_count == 160
+    assert mock_callback.on_batch_end.call_count == 160
+
+
 def test_api_callbacks_fixed_train_steps_less_than_one_epoch(tmpdir, csv_filename):
     # If train_steps is set manually, epochs is ignored.
     train_steps = total_batches = 6
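
The assertion values follow directly from the fix: with batch_size = 1 nothing is dropped, so each of the 80 examples yields one batch per epoch and epoch accounting is unchanged. Illustrative arithmetic only:

    # Illustrative check of the expected callback counts in the test above.
    num_examples, epochs, batch_size = 80, 2, 1
    batches_per_epoch = num_examples // batch_size  # 80: the 1-row batches are kept
    assert batches_per_epoch * epochs == 160        # on_batch_start / on_batch_end calls
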