Refactor Jupyter notebooks for chapters 10-17: update content, add pa… by Tarun-Sreepada · Pull Request #572 · UdayLab/PAMI · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Refactor Jupyter notebooks for chapters 10-17: update content, add pa… #572

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions PAMI/extras/convert/CSV2Parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ def convert(self):

df = pd.DataFrame(file)

df.columns = [f"{i}" for i in range(maxLen)] # Assigning generic column names

df.to_parquet(self.outputFile)

self.end = time.time()
Expand Down
45 changes: 45 additions & 0 deletions PAMI/extras/convert/DF2DB.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import sys,psutil,os,time
from typing import Union
import operator
import pandas as pd

condition_operator = {
'<': operator.lt,
Expand Down Expand Up @@ -108,6 +109,50 @@ def __init__(self, inputDF, DFtype='dense') -> None:
self.items = list(self.inputDF.columns.values)
self.tids = list(self.inputDF.index)

def convert2SequentialDatabase(self,
                               sep: str = '\t',
                               txn_sep: str = '\t',
                               seq_sep: str = '-1',
                               oFile: str = 'sequential_database.csv') -> None:
    """
    Group the rows of the input DataFrame by 'customerID' and write one
    sequence per customer to a text file.

    Every row is treated as one transaction: the non-null values of all
    columns other than 'customerID' are converted to strings and joined
    with ``txn_sep``. The transactions of a customer are then joined with
    ``seq_sep`` and written as a single line. The customer id itself is
    not written.

    :param sep: Currently unused; kept so existing callers that pass it keep working.
    :type sep: str
    :param txn_sep: Separator between items within a transaction.
    :type txn_sep: str
    :param seq_sep: Separator between transactions within a sequence.
    :type seq_sep: str
    :param oFile: Name of the output file.
    :type oFile: str
    :return: None
    :raises ValueError: if the DataFrame has no 'customerID' column.
    """
    self.df = self.inputDF

    if 'customerID' not in self.df.columns:
        raise ValueError("DataFrame must contain 'customerID' column.")

    # Every column except the grouping key holds items.
    item_cols = [c for c in self.df.columns if c != 'customerID']

    # Build every line before opening the file so that a failure while
    # reading the DataFrame does not leave a partially written output.
    lines = []
    for _, group in self.df.groupby('customerID'):
        txns = [
            txn_sep.join(str(row[col]) for col in item_cols if pd.notna(row[col]))
            for _, row in group.iterrows()
        ]
        lines.append(seq_sep.join(txns) + "\n")

    with open(oFile, 'w') as f:
        f.writelines(lines)

def convert2TransactionalDatabase(self, oFile: str, condition: str, thresholdValue: Union[int, float]) -> None:
"""
create transactional database and return oFileName
Expand Down
3 changes: 2 additions & 1 deletion PAMI/extras/neighbours/FindNeighboursUsingEuclidean.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import time
import sys, psutil, os,tqdm
import pandas as pd
import numpy as np


class FindNeighboursUsingEuclidean:
Expand Down Expand Up @@ -119,7 +120,7 @@ def save(self,oFile: str) -> None:
raise ValueError("Run create() before calling save().")

with open(oFile, "w") as f:
for i in tqdm(range(self.coords.shape[0])):
for i in tqdm.tqdm(range(self.coords.shape[0])):
point = self.coords[i]
neighbor_mask = self.within_dist[i]
if neighbor_mask.any():
Expand Down
42 changes: 42 additions & 0 deletions PAMI/extras/stats/SymbolicSequentialDatabase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt



class SymbolicSequentialDatabase:
    """
    Collect symbol statistics for a text file that is read as one long
    symbolic sequence (all lines concatenated, line breaks removed).

    Call run() first; printStats() and plotGraphs() report what run() computed.
    """

    def __init__(self, filename):
        """
        :param filename: Path of the file holding the symbolic sequence.
        """
        self.filename = filename
        self.sequence = ""

    def _requireRun(self, caller):
        """
        Raise RuntimeError when run() has not been called yet.

        :param caller: Name of the public method performing the check.
        """
        # symbolCounts is only created inside run(), so its absence means
        # the statistics have not been computed.
        if not hasattr(self, 'symbolCounts'):
            raise RuntimeError(f"You must call run() before {caller}().")

    def run(self):
        """
        Read the file, join its lines into one string and count every character.

        Sets ``sequence``, ``length``, ``symbolCounts`` and ``numUniqueSymbols``.
        """
        with open(self.filename, 'r') as f:
            self.sequence = ''.join(f.read().splitlines())

        self.length = len(self.sequence)
        self.symbolCounts = Counter(self.sequence)
        self.numUniqueSymbols = len(self.symbolCounts)

    def printStats(self):
        """
        Print the number of distinct symbols, the sequence length and the
        frequency of each symbol.

        :raises RuntimeError: if run() has not been called.
        """
        self._requireRun('printStats')

        print(f"Total Number of Symbols: {self.numUniqueSymbols}")
        print(f"Total Size of Sequence: {self.length}")
        print("Symbol Frequencies:")
        for symbol, count in self.symbolCounts.items():
            print(f" '{symbol}': {count}")

    def plotGraphs(self):
        """
        Show a bar chart of symbol frequencies, most frequent symbol first.

        :raises RuntimeError: if run() has not been called.
        """
        self._requireRun('plotGraphs')

        # most_common() orders by count, descending. Building the two lists
        # separately (instead of unpacking zip(*...)) also works when the
        # input file was empty and there is nothing to plot.
        ranked = self.symbolCounts.most_common()
        symbols = [symbol for symbol, _ in ranked]
        counts = [count for _, count in ranked]

        plt.figure(figsize=(12, 6))
        plt.bar(symbols, counts, width=0.6)
        plt.title("Symbol Frequency Distribution")
        plt.xlabel("Symbol")
        plt.ylabel("Frequency")
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()
23 changes: 18 additions & 5 deletions PAMI/extras/stats/sequentialDatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@
import PAMI.extras.graph.plotLineGraphFromDictionary as plt
import sys
from typing import List, Dict, Tuple, Set, Union, Any, Generator

# import counter
from collections import Counter

class sequentialDatabase:
"""
Expand Down Expand Up @@ -184,8 +185,9 @@ def readDatabase(self) -> None:
with open(self.inputFile, 'r') as f:
rowNum = 0
for line in f:
temp = [i.rstrip(self.sep) for i in line.split('-1')]
temp = [i.rstrip(self.sep) for i in line.strip().split('-1')]
temp = [x for x in temp if x]

temp.pop()
seq = []
self.seqLengthList.append(len(temp))
Expand All @@ -205,6 +207,7 @@ def readDatabase(self) -> None:
self.database[rowNum] = seq



def getDatabaseSize(self) -> int:
"""
get the size of database
Expand Down Expand Up @@ -236,6 +239,7 @@ def getAverageSubsequencePerSequenceLength(self) -> float:
:rtype: float
"""
totalLength = sum(self.seqLengthList)
print(f"Total Length of all sequences: {totalLength} and number of sequences: {len(self.database)}")
return totalLength / len(self.database)

def getAverageItemPerSubsequenceLength(self) -> float:
Expand Down Expand Up @@ -412,9 +416,18 @@ def plotGraphs(self) -> None:
itemFrequencies = self.getFrequenciesInRange()
seqLen = self.getSequencialLengthDistribution()
subLen=self.getSubsequencialLengthDistribution()
plt.plotLineGraphFromDictionary(itemFrequencies, 100, 'Frequency', 'No of items', 'frequency')
plt.plotLineGraphFromDictionary(seqLen, 100, 'sequence length', 'sequence length', 'frequency')
plt.plotLineGraphFromDictionary(subLen, 100, 'subsequence length', 'subsequence length', 'frequency')

custom_counter = Counter()
for seq in self.database.values():
for sub in seq:
custom_counter.update(sub)

# print(custom_counter)

# print(itemFrequencies)
plt.plotLineGraphFromDictionary(custom_counter, 100, 0, 'No of items', 'frequency')
plt.plotLineGraphFromDictionary(seqLen, 100, 0, 'sequence length', 'frequency')
plt.plotLineGraphFromDictionary(subLen, 100, 0, 'subsequence length', 'frequency')

if __name__ == '__main__':
_ap=str()
Expand Down
Loading
0