UdayLab · udayRage · Jun 17, 2025 · Jun 17, 2025
diff --git a/PAMI/extras/convert/CSV2Parquet.py b/PAMI/extras/convert/CSV2Parquet.py
@@ -120,6 +120,8 @@ def convert(self):
 
         df = pd.DataFrame(file)
 
+        df.columns = [f"{i}" for i in range(maxLen)]  # Assigning generic column names
+
         df.to_parquet(self.outputFile)
 
         self.end = time.time()

diff --git a/PAMI/extras/convert/DF2DB.py b/PAMI/extras/convert/DF2DB.py
@@ -39,6 +39,7 @@
 import sys,psutil,os,time
 from typing import Union
 import operator
+import pandas as pd
 
 condition_operator = {
     '<': operator.lt,
@@ -108,6 +109,50 @@ def __init__(self, inputDF, DFtype='dense') -> None:
         self.items = list(self.inputDF.columns.values)
         self.tids = list(self.inputDF.index)
 
+    def convert2SequentialDatabase(self,
+                                   sep: str = '\t',
+                                   txn_sep: str = '\t',
+                                   seq_sep: str = '-1',
+                                   oFile: str = 'sequential_database.csv') -> None:
+        """
+        Group the rows of the input dataframe by 'customerID' and write one sequence per customer.
+
+        Each row is one transaction: the non-null values of every column other than 'customerID'
+        are converted to strings and joined with txn_sep. A customer's transactions are joined with
+        seq_sep and written as one line of oFile. The customer identifier itself is not written.
+
+        :param sep: Not used by this method; accepted so that existing calls keep working.
+        :type sep: str
+        :param txn_sep: Separator between items within a transaction.
+        :type txn_sep: str
+        :param seq_sep: Separator between transactions within a sequence.
+        :type seq_sep: str
+        :param oFile: Name of the output file.
+        :type oFile: str
+        :return: None
+        :raises ValueError: If the dataframe has no 'customerID' column.
+        """
+        self.df = self.inputDF
+
+        if 'customerID' not in self.df.columns:
+            raise ValueError("DataFrame must contain 'customerID' column.")
+
+        # Identify item columns (all except 'customerID')
+        item_cols = [c for c in self.df.columns if c != 'customerID']
+
+        # Build one output line per customer before touching the file.
+        lines = []
+        for _, group in self.df.groupby('customerID'):
+            txns = []
+            for _, row in group.iterrows():
+                # A transaction holds the non-null item values of a single row.
+                items = [str(row[col]) for col in item_cols if pd.notna(row[col])]
+                txns.append(txn_sep.join(items))
+            lines.append(seq_sep.join(txns) + "\n")
+
+        with open(oFile, 'w') as f:
+            f.writelines(lines)
+
     def convert2TransactionalDatabase(self, oFile: str, condition: str, thresholdValue: Union[int, float]) -> None:
         """
         create transactional database and return oFileName

diff --git a/PAMI/extras/neighbours/FindNeighboursUsingEuclidean.py b/PAMI/extras/neighbours/FindNeighboursUsingEuclidean.py
@@ -33,6 +33,7 @@
 import time
 import sys, psutil, os,tqdm
 import pandas as pd
+import numpy as np
 
 
 class FindNeighboursUsingEuclidean:
@@ -119,7 +120,7 @@ def save(self,oFile: str) -> None:
             raise ValueError("Run create() before calling save().")
 
         with open(oFile, "w") as f:
-            for i in tqdm(range(self.coords.shape[0])):
+            for i in tqdm.tqdm(range(self.coords.shape[0])):
                 point = self.coords[i]
                 neighbor_mask = self.within_dist[i]
                 if neighbor_mask.any():

diff --git a/PAMI/extras/stats/SymbolicSequentialDatabase.py b/PAMI/extras/stats/SymbolicSequentialDatabase.py
@@ -0,0 +1,42 @@
+import pandas as pd
+from collections import Counter
+import matplotlib.pyplot as plt
+
+
+
+class SymbolicSequentialDatabase:
+    """
+    Compute and plot per-symbol statistics of a text file read as one symbolic sequence.
+
+    Every character of the file, apart from line breaks, is counted as one symbol.
+    """
+
+    def __init__(self, filename):
+        """
+        :param filename: Path of the text file that holds the sequence.
+        """
+        self.filename = filename
+        self.sequence = ""
+
+    def run(self):
+        """
+        Read the file, concatenate its lines into one string and count each character.
+
+        Sets sequence, length, symbolCounts and numUniqueSymbols on the instance.
+        """
+        with open(self.filename, 'r') as f:
+            self.sequence = ''.join(f.read().splitlines())
+
+        self.length = len(self.sequence)
+        self.symbolCounts = Counter(self.sequence)
+        self.numUniqueSymbols = len(self.symbolCounts)
+
+    def printStats(self):
+        """
+        Print the number of distinct symbols, the sequence length and each symbol's count.
+
+        :raises RuntimeError: If run() has not been called yet.
+        """
+        if not hasattr(self, 'symbolCounts'):
+            raise RuntimeError("You must call run() before printStats().")
+
+        print(f"Total Number of Symbols: {self.numUniqueSymbols}")
+        print(f"Total Size of Sequence:  {self.length}")
+        print("Symbol Frequencies:")
+        for symbol, count in self.symbolCounts.items():
+            print(f"  '{symbol}': {count}")
+
+    def plotGraphs(self):
+        """
+        Show a bar chart of the symbol counts, most frequent symbol first.
+
+        :raises RuntimeError: If run() has not been called yet.
+        :raises ValueError: If the sequence is empty, since there is nothing to plot.
+        """
+        if not hasattr(self, 'symbolCounts'):
+            raise RuntimeError("You must call run() before plotGraphs().")
+        if not self.symbolCounts:
+            # zip(*[]) yields nothing to unpack; fail with a readable message instead.
+            raise ValueError("Cannot plot symbol frequencies: the sequence is empty.")
+
+        symbols, counts = zip(*self.symbolCounts.most_common())
+
+        plt.figure(figsize=(12, 6))
+        plt.bar(symbols, counts, width=0.6)
+        plt.title("Symbol Frequency Distribution")
+        plt.xlabel("Symbol")
+        plt.ylabel("Frequency")
+        plt.grid(axis='y', linestyle='--', alpha=0.7)
+        plt.tight_layout()
+        plt.show()
diff --git a/PAMI/extras/stats/sequentialDatabase.py b/PAMI/extras/stats/sequentialDatabase.py
@@ -41,7 +41,8 @@
 import PAMI.extras.graph.plotLineGraphFromDictionary as plt
 import sys
 from typing import List, Dict, Tuple, Set, Union, Any, Generator
-
+# import counter
+from collections import Counter
 
 class sequentialDatabase:
     """
@@ -184,8 +185,9 @@ def readDatabase(self) -> None:
                 with open(self.inputFile, 'r') as f:
                     rowNum = 0
                     for line in f:
-                        temp = [i.rstrip(self.sep) for i in line.split('-1')]
+                        temp = [i.rstrip(self.sep) for i in line.strip().split('-1')]
                         temp = [x for x in temp if x]
+
                         temp.pop()
                         seq = []
                         self.seqLengthList.append(len(temp))
@@ -205,6 +207,7 @@ def readDatabase(self) -> None:
                             self.database[rowNum] = seq
 
 
+
     def getDatabaseSize(self) -> int:
         """
         get the size of database
@@ -236,6 +239,7 @@ def getAverageSubsequencePerSequenceLength(self) -> float:
         :rtype: float
         """
         totalLength = sum(self.seqLengthList)
+        print(f"Total Length of all sequences: {totalLength} and number of sequences: {len(self.database)}")
         return totalLength / len(self.database)
 
     def getAverageItemPerSubsequenceLength(self) -> float:
@@ -412,9 +416,18 @@ def plotGraphs(self) -> None:
         itemFrequencies = self.getFrequenciesInRange()
         seqLen = self.getSequencialLengthDistribution()
         subLen=self.getSubsequencialLengthDistribution()
-        plt.plotLineGraphFromDictionary(itemFrequencies, 100, 'Frequency', 'No of items', 'frequency')
-        plt.plotLineGraphFromDictionary(seqLen, 100, 'sequence length', 'sequence length', 'frequency')
-        plt.plotLineGraphFromDictionary(subLen, 100, 'subsequence length', 'subsequence length', 'frequency')
+
+        custom_counter = Counter()
+        for seq in self.database.values():
+            for sub in seq:
+                custom_counter.update(sub)
+
+        # print(custom_counter)
+
+        # print(itemFrequencies)
+        plt.plotLineGraphFromDictionary(custom_counter, 100, 0, 'No of items', 'frequency')
+        plt.plotLineGraphFromDictionary(seqLen, 100, 0, 'sequence length', 'frequency')
+        plt.plotLineGraphFromDictionary(subLen, 100, 0, 'subsequence length', 'frequency')
 
 if __name__ == '__main__':
     _ap=str()