Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Working with Batches for Sequence Tasks¶

When working with sequential data in neural networks, such as recurrent neural networks (RNNs) and convolutional neural networks (CNNs), one major problem arises when processing data in batches: sequences often have varying lengths. Unlike fixed-size inputs like images, text and time series data can have different lengths, making it difficult to create uniform batches for training. Since neural networks expect inputs of the same shape, handling sequences of different sizes requires special preprocessing techniques, such as padding or truncation.

Another challenge is maintaining the temporal dependencies in the data while batching multiple sequences together. RNNs, for example, process sequences step by step, maintaining a hidden state that carries information from previous time steps. When batching sequences, the model must ensure that dependencies between time steps are preserved, which becomes complicated if sequences in a batch have different lengths. Padding shorter sequences with zeros (or a special padding token in NLP) helps align batch sizes, but this can introduce redundant information that affects learning if not handled properly.

Additionally, in CNNs designed for sequence processing (such as 1D convolutional networks for text or time series), the issue of varying sequence lengths can complicate convolution operations. Since convolutional filters slide over fixed-size windows of input data, sequences of different lengths might require resizing, padding, or adaptive pooling strategies to ensure compatibility within a batch. If not handled properly, this can lead to loss of meaningful information or inefficient training due to excessive padding.

Lastly, batching sequential data can introduce inefficiencies in computation and memory usage. When padding is used to make all sequences in a batch the same length, the model must process padded elements even though they contain no useful information. This can lead to unnecessary computations and increased memory usage, slowing down training. Advanced techniques like masking (to ignore padded elements) and dynamic batching (grouping sequences of similar lengths together) help mitigate this issue, but they add complexity to model implementation.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [1]:
from src.utils.libimports.varseq import *
from src.utils.sampling.batchsampler import EqualLengthsBatchSampler

Create Example Batches¶

Throughout this notebook we make use of two example datasets to illustrate how to handle sequences of variable length for different sequence tasks.

Text Classification¶

We first consider a many-to-one sequence task where the input for the neural network is a sequence (or batch of sequences!) and the output is a single value such as a class label. A very common example is text classification. In the code cell below, we create a simple batch where each entry is a tuple containing the input sequence and the class label. Each sequence represents a sentence, where the values are the token indices after converting all words in the sentence into their unique indices based on the vocabulary. For further processing, we also split the batch into the list of sequences and the list/array of class labels (i.e., the targets).

In [2]:
data_classification = [
    ([ 6, 17, 18, 25,  9, 11,  7, 26,  6, 12,  7,  8], 0),
    ([13, 17, 14, 15,  9,  6, 12,  7, 16, 19, 10,  8], 0),
    ([ 6, 15,  9, 11,  7, 18, 19, 10,  6, 20,  8], 0),
    ([11,  7, 14, 21, 27, 12,  7, 14, 21,  8], 0),
    ([ 6, 15, 28, 29, 22, 23, 16,  6, 10,  8], 1),
    ([13, 10,  9,  6, 22, 16, 13, 10,  9, 30, 23,  8], 1),
    ([24, 20,  1, 24,  9,  1,  7,  8], 1),
    ([12, 13,  4, 15, 18,  2,  4, 10,  8], 1)
]
# Extract all sequences and convert each sequence to a tensor of long values
sequences = [ torch.LongTensor(sample[0]) for sample in data_classification ]
# Extract targets (i.e., class labels for a binary classification task)
targets = torch.LongTensor([ sample[1] for sample in data_classification ])

Sequence-to-Sequence¶

For a second example dataset, we consider a many-to-many (or sequence-to-sequence) sequence task where both the inputs and the targets are sequences. A very common example is machine translation, where the input sequences are the sentences in the source language and the target sequences are the translated sentences in the target language. The code cell below creates a mock sequence-to-sequence (seq2seq) dataset for machine translation. Again, the sequence values may represent token indices based on the vocabulary; the index values carry no meaning here, as only the lengths of the input and target sequences matter. The code also extracts both the list of input sequences and the list of target sequences for further processing.

In [3]:
data_seq2seq = [
    ([1, 2, 3], [1, 2, 3, 4]),
    ([1, 2, 3], [1, 2, 3, 4]),
    ([1, 2, 3], [1, 2, 3, 4]),
    ([1, 2, 3], [1, 2, 3, 4]),
    ([1, 2, 3], [1, 2, 3, 4]),
    ([1, 2, 3], [1, 2, 3, 4]),
    ([1, 2, 3], [1, 2, 3, 4]),
    ([1, 2, 3, 4], [1, 2, 3, 4]),
    ([1, 2, 3, 4], [1, 2, 3, 4]),
    ([1, 2, 3, 4], [1, 2, 3, 4]),
    ([1, 2, 3, 4], [1, 2]),
    ([1, 2, 3, 4], [1, 2]),
    ([1, 2, 3, 4], [1, 2]),
    ([1, 2, 3, 4], [1, 2]),
    ([1, 2, 3, 4], [1, 2]),
    ([1, 2, 3, 4], [1, 2]),
    ([1, 2, 3], [1, 2, 3, 4, 5]),
    ([1, 2, 3], [1, 2, 3, 4, 5]),
    ([1, 2, 3], [1, 2, 3, 4, 5]),
    ([1, 2, 3], [1, 2, 3, 4, 5]),
    ([1, 2, 3], [1, 2, 3, 4, 5]),
    ([1, 2, 3], [1, 2, 3, 4, 5]),
    ([1, 2, 3], [1, 2, 3, 4, 5]),
    ([1, 2, 3], [1, 2, 3, 4, 5]),
    ([1, 2, 3], [1, 2, 3, 4, 5]),
]
# Extract input and target sequences
input_sequences  = [ tup[0] for tup in data_seq2seq ]
target_sequences = [ tup[1] for tup in data_seq2seq ]

Approach 1: Padding & Truncating¶

The minimum requirement for an input batch — i.e., a list of sequences grouped together — is that all sequences in the same batch have the same length. In the following, we use the length of a batch to mean the length of the sequences in that batch and the size of a batch to mean the number of sequences in that batch. We therefore want to create a batch of size 8 containing all sentences of the example classification dataset. Right now, most of those 8 sequences have different lengths, so we need to fix this.

Basic Padding¶

With the requirement that each batch may only contain sequences of the same length, we can utilize the PyTorch method pad_sequence. This method first finds the longest sequence(s) and then pads all shorter sequences using a specified value. The result, of course, is that all sequences have the same length. As such, the result is no longer a list of 8 1d tensors but a 2d tensor with a shape of (batch_size, max_seq_len), where batch_size is 8 (the number of sequences) and max_seq_len is 12 (the length of the longest sequence(s)).

In [4]:
sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0)

print(sequences_padded)
print(sequences_padded.shape)
tensor([[ 6, 17, 18, 25,  9, 11,  7, 26,  6, 12,  7,  8],
        [13, 17, 14, 15,  9,  6, 12,  7, 16, 19, 10,  8],
        [ 6, 15,  9, 11,  7, 18, 19, 10,  6, 20,  8,  0],
        [11,  7, 14, 21, 27, 12,  7, 14, 21,  8,  0,  0],
        [ 6, 15, 28, 29, 22, 23, 16,  6, 10,  8,  0,  0],
        [13, 10,  9,  6, 22, 16, 13, 10,  9, 30, 23,  8],
        [24, 20,  1, 24,  9,  1,  7,  8,  0,  0,  0,  0],
        [12, 13,  4, 15, 18,  2,  4, 10,  8,  0,  0,  0]])
torch.Size([8, 12])

Important: The padding_value cannot be chosen arbitrarily. Using the correct value for padding sequences as input for neural networks is crucial because improper padding can introduce unintended biases and negatively impact model performance. Padding is commonly used when dealing with variable-length sequences in tasks like natural language processing (NLP) and time-series analysis, ensuring that all input sequences have the same length for batch processing. However, if the padding value is not chosen correctly, the model may misinterpret it as meaningful data rather than a placeholder. For example, using a common word (e.g., "the" in NLP) as a padding token instead of a distinct padding value can lead to incorrect learning patterns.

Additionally, incorrect padding values can affect loss calculation and attention mechanisms in models like recurrent neural networks (RNNs), long short-term memory (LSTM) networks, and transformers. Many neural networks use masking to ignore padding values during computations, ensuring that padded elements do not contribute to learning. If an improper padding value is used without proper masking, the model might allocate attention to irrelevant padded positions, reducing overall accuracy. Choosing a distinct padding token (e.g., zero for numerical sequences or a special <PAD> token for text) and correctly implementing masking techniques help the model focus on meaningful data while avoiding unnecessary computational overhead.

In our example use case here, we assume that the sequences of word/token indices were created using a vocabulary containing the special token <PAD> associated with the index $0$. We therefore have to pick $0$ as the padding value now. However, note that there is nothing special about the value $0$. If the index of <PAD> in the vocabulary had been $1455$, then we would have needed to use padding_value=1455 in the method pad_sequence(). It has become a best practice to use $0$ for the padding token, which is why we set up the vocabulary the way we do here.
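To make the role of the padding value and masking a bit more concrete, the short sketch below shows one common way a boolean padding mask could be derived from the padded batch created above, assuming (as we do here) that $0$ is the index of the <PAD> token. Masking itself is covered in a separate notebook, so this is only an illustration.

# A minimal sketch: derive a boolean mask from the padded batch, assuming 0 is
# the index of the <PAD> token (True marks real tokens, False marks padding)
padding_mask = (sequences_padded != 0)

print(padding_mask)
print(padding_mask.sum(dim=1))  # number of real (non-padded) tokens per sequence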

By default, the method pad_sequence() pads to the right. However, with PyTorch 2.5 or higher, you can also specify that sequences are padded to the left.

In [5]:
sequences_padded_left = pad_sequence(sequences, batch_first=True, padding_value=0, padding_side='left')

print(sequences_padded_left)
print(sequences_padded_left.shape)

# We only run this line since we want to assume right-padding for subsequent code cells
sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0, padding_side="right") # default side
tensor([[ 6, 17, 18, 25,  9, 11,  7, 26,  6, 12,  7,  8],
        [13, 17, 14, 15,  9,  6, 12,  7, 16, 19, 10,  8],
        [ 0,  6, 15,  9, 11,  7, 18, 19, 10,  6, 20,  8],
        [ 0,  0, 11,  7, 14, 21, 27, 12,  7, 14, 21,  8],
        [ 0,  0,  6, 15, 28, 29, 22, 23, 16,  6, 10,  8],
        [13, 10,  9,  6, 22, 16, 13, 10,  9, 30, 23,  8],
        [ 0,  0,  0,  0, 24, 20,  1, 24,  9,  1,  7,  8],
        [ 0,  0,  0, 12, 13,  4, 15, 18,  2,  4, 10,  8]])
torch.Size([8, 12])

The shape of the batch will again be (batch_size, max_seq_len) as we still only pad up to the length of the longest sequence.

The difference between left and right padding in neural networks refers to where padding tokens are inserted relative to the original sequence. Left padding (pre-padding) adds padding tokens to the beginning of the sequence, while right padding (post-padding) adds them to the end. For example, given a sequence [1, 2, 3], left padding to length 5 would result in [0, 0, 1, 2, 3], while right padding would give [1, 2, 3, 0, 0].

Right padding is more commonly used in neural network training, particularly with architectures like recurrent neural networks (RNNs), LSTMs, and transformers, because it allows the model to process the sequence in its natural order without interruption from padding tokens. Left padding may sometimes be used to align recent data points at the same position, such as in certain time-series tasks. Regardless of the padding method, models typically need masking mechanisms to ensure that padding tokens are ignored during training to avoid negatively impacting model performance.
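As a tiny sketch of this example, the two calls below use torch.nn.functional.pad (covered in more detail further down) to pad the sequence [1, 2, 3] to length 5 on the left and on the right, respectively.

# Pad the example sequence [1, 2, 3] to length 5 on either side
seq = torch.LongTensor([1, 2, 3])

print(torch.nn.functional.pad(seq, (2, 0), value=0))   # left padding:  tensor([0, 0, 1, 2, 3])
print(torch.nn.functional.pad(seq, (0, 2), value=0))   # right padding: tensor([1, 2, 3, 0, 0])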

Additional Steps for Convolutional Neural Networks (CNNs)¶

Apart from ensuring that all sequences in our batch have the same length, many network architectures such as Convolutional Neural Networks (CNNs) impose an additional requirement that all input sequences across all batches must have the same length. In the case of CNNs, this is to ensure that subsequent layers (after the Convolution Layer(s) and MaxPooling/AveragePooling layer(s)) receive inputs with the expected size.

Important: PyTorch also supports adaptive max pooling and adaptive average pooling, which allow specifying a fixed output size. Thus, the output size of the pooling layer does not depend on the input size. However, particularly for text, where the length of sequences can vary significantly, ensuring the same output size means that inputs are treated quite differently. To keep things straightforward here, we ignore adaptive pooling.
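Just to illustrate the claim that the output size does not depend on the input length, here is a small sketch; the batch size, channel count, and output size below are arbitrary values chosen for this illustration only.

# Adaptive pooling maps inputs of any length to a fixed output size
adaptive_pool = torch.nn.AdaptiveMaxPool1d(output_size=4)

for seq_len in (8, 12, 20):
    x = torch.randn(1, 16, seq_len)      # (batch_size, channels, seq_len)
    print(adaptive_pool(x).shape)        # always torch.Size([1, 16, 4])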

When we need to ensure that every batch has the same shape with respect to the length of its sequences, there are two cases to consider:

  • If the sequences are too long, we need to truncate them

  • If the sequences are too short, we need to (further) pad them

Let's have a look at how we can accomplish this.

Truncate to Required Length¶

Truncating or shortening the sequences in our batch tensor is very straightforward since we can simply use normal array/tensor indexing. We only need to ensure that we truncate the correct dimension -- after all, we want to shorten the sequences, not reduce the number of sequences in the batch. To give an example, the code cell below shortens all sequences to a fixed length of 5. Of course, if the specified value for FIXED_LENGTH is larger than the number of items in the longest sequence, the code cell below has no effect on the batch.

Your turn: Try different values for FIXED_LENGTH to see how the output changes and if it matches your expectations.

In [6]:
FIXED_LENGTH = 5

sequences_padded_truncated = sequences_padded[:,:FIXED_LENGTH]

print(sequences_padded_truncated)
tensor([[ 6, 17, 18, 25,  9],
        [13, 17, 14, 15,  9],
        [ 6, 15,  9, 11,  7],
        [11,  7, 14, 21, 27],
        [ 6, 15, 28, 29, 22],
        [13, 10,  9,  6, 22],
        [24, 20,  1, 24,  9],
        [12, 13,  4, 15, 18]])

Pad to Required Length¶

For handling a batch containing sequences that are too short, we can utilize the pad() method of PyTorch to make our lives easier. Since pad() expects a tensor as input, we first need to call pad_sequence() to get from a list of 1d tensors to a 2d tensor.

The method pad() is very flexible, allowing the padding of tensors with respect to all dimensions. Since our tensor is of shape (batch_size, seq_len), we can pad both the batch_size dimension and the seq_len dimension. Here, of course, we are only interested in padding our sequences and not the number of sequences in our batch. Like above, we can use right padding as well as left padding. This means we have to be careful to use pad() correctly to get the expected output. Since our batch has 2 dimensions, we need to specify a 4-tuple to tell the method how to pad our batch tensor. Assuming that pad_size is the difference between the desired length and the current length of all sequences, we can do

  • Left padding: (pad_size, 0, 0, 0)
  • Right padding: (0, pad_size, 0, 0)

The last two values are always $0$ as they refer to the batch_size dimension, which we do not want to change. Run the code cell below to perform either left or right padding.

In [7]:
TARGET_LENGTH = 15

pad_size = TARGET_LENGTH - sequences_padded.shape[1]

#sequences_max_padded = torch.nn.functional.pad(sequences_padded, (pad_size, 0, 0, 0), mode="constant", value=0) # left padding
sequences_max_padded = torch.nn.functional.pad(sequences_padded, (0, pad_size, 0, 0), mode="constant", value=0) # right padding

print (sequences_max_padded)
tensor([[ 6, 17, 18, 25,  9, 11,  7, 26,  6, 12,  7,  8,  0,  0,  0],
        [13, 17, 14, 15,  9,  6, 12,  7, 16, 19, 10,  8,  0,  0,  0],
        [ 6, 15,  9, 11,  7, 18, 19, 10,  6, 20,  8,  0,  0,  0,  0],
        [11,  7, 14, 21, 27, 12,  7, 14, 21,  8,  0,  0,  0,  0,  0],
        [ 6, 15, 28, 29, 22, 23, 16,  6, 10,  8,  0,  0,  0,  0,  0],
        [13, 10,  9,  6, 22, 16, 13, 10,  9, 30, 23,  8,  0,  0,  0],
        [24, 20,  1, 24,  9,  1,  7,  8,  0,  0,  0,  0,  0,  0,  0],
        [12, 13,  4, 15, 18,  2,  4, 10,  8,  0,  0,  0,  0,  0,  0]])

Again, we need to use value=0 as this is our index of the <PAD> token in the vocabulary.

Complete Auxiliary Method¶

In practice, we don't know ahead of time whether we need to truncate or pad a batch. It's therefore convenient to have a method that truncates or pads the sequences of a batch depending on how the length of the longest sequence(s) compares to the required length. The method create_fixed_length_batch() is a simple example implementation; it merely combines the ideas from the code cells above into a single method. Keep in mind that the way sequences of variable length should be handled in practice often depends on the nature of the data, the exact task, and the network architecture that is used. So the implementation of create_fixed_length_batch() only illustrates some basic ideas. For example, the method assumes that the batch_size dimension is the first dimension; notice the batch_first=True parameter in the code below.

In [8]:
def create_fixed_length_batch(sequences, target_length, padding_value=0, padding_side="right"):
    
    # Pad sequences w.r.t. longest sequences
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=padding_value, padding_side=padding_side)

    # Get the current sequence length
    max_seq_len = sequences_padded.shape[1]
    
    if max_seq_len > target_length:
        # Truncate sequences if too long
        return sequences_padded[:,:target_length]
    else:
        # Pad sequences if too short
        if padding_side == "right":
            pad_tuple = (0, target_length-max_seq_len, 0, 0)
        else:
            pad_tuple = (target_length-max_seq_len, 0, 0, 0)
        return torch.nn.functional.pad(sequences_padded, pad_tuple, mode="constant", value=padding_value)

First, let's run the method over our batch of 8 sequences assuming that we need to enforce a length of 5. Since the sequences in our padded tensor are longer than that, we need to shorten all sequences.

In [9]:
create_fixed_length_batch(sequences, 5, padding_side="right")
#create_fixed_length_batch(sequences, 5, padding_side="left")
Out[9]:
tensor([[ 6, 17, 18, 25,  9],
        [13, 17, 14, 15,  9],
        [ 6, 15,  9, 11,  7],
        [11,  7, 14, 21, 27],
        [ 6, 15, 28, 29, 22],
        [13, 10,  9,  6, 22],
        [24, 20,  1, 24,  9],
        [12, 13,  4, 15, 18]])

If we use create_fixed_length_batch() to enforce a sequence length of 15, we have to pad all sequences since all are shorter than 15.

In [10]:
create_fixed_length_batch(sequences, 15, padding_side="right")
#create_fixed_length_batch(sequences, 15, padding_side="left")
Out[10]:
tensor([[ 6, 17, 18, 25,  9, 11,  7, 26,  6, 12,  7,  8,  0,  0,  0],
        [13, 17, 14, 15,  9,  6, 12,  7, 16, 19, 10,  8,  0,  0,  0],
        [ 6, 15,  9, 11,  7, 18, 19, 10,  6, 20,  8,  0,  0,  0,  0],
        [11,  7, 14, 21, 27, 12,  7, 14, 21,  8,  0,  0,  0,  0,  0],
        [ 6, 15, 28, 29, 22, 23, 16,  6, 10,  8,  0,  0,  0,  0,  0],
        [13, 10,  9,  6, 22, 16, 13, 10,  9, 30, 23,  8,  0,  0,  0],
        [24, 20,  1, 24,  9,  1,  7,  8,  0,  0,  0,  0,  0,  0,  0],
        [12, 13,  4, 15, 18,  2,  4, 10,  8,  0,  0,  0,  0,  0,  0]])

With this method, we can now "resize" any batch of sequences to the required length. Again, this length is determined by the overall network architecture, mainly the layers (often linear layers) following the convolution and pooling layers.

Additional Steps for Recurrent Neural Networks (RNNs)¶

Let's briefly have a look again at our batch with the minimum required padding:

In [11]:
print(sequences_padded)
tensor([[ 6, 17, 18, 25,  9, 11,  7, 26,  6, 12,  7,  8],
        [13, 17, 14, 15,  9,  6, 12,  7, 16, 19, 10,  8],
        [ 6, 15,  9, 11,  7, 18, 19, 10,  6, 20,  8,  0],
        [11,  7, 14, 21, 27, 12,  7, 14, 21,  8,  0,  0],
        [ 6, 15, 28, 29, 22, 23, 16,  6, 10,  8,  0,  0],
        [13, 10,  9,  6, 22, 16, 13, 10,  9, 30, 23,  8],
        [24, 20,  1, 24,  9,  1,  7,  8,  0,  0,  0,  0],
        [12, 13,  4, 15, 18,  2,  4, 10,  8,  0,  0,  0]])

Now, this representation can be used as input for a Recurrent Neural Network (RNN). The RNN will process all sequences in the batch in parallel, one time step at a time. Of course, this is possible since all sequences have the same length. However, there are two issues to consider for padded sequences:

  • The padding token <PAD> does not really have any meaning. While we can generally assume that the RNN will "learn" that <PAD> does not mean anything, it might still affect the results, particularly if an original sequence is very short and a lot of padding was required to reach the final length.

  • Even if we assume that <PAD> won't negatively affect the results, processing the padding still requires computational steps. In principle, an RNN could stop processing a sequence when it reaches the first padding index.

To this end, PyTorch introduces the notion of packing. A PackedSequence is a Python object that provides an internal representation of a padded batch together with the true lengths of all sequences (i.e., total length minus the number of padding indices). Such a PackedSequence object can then be used as input for an RNN layer to tell it when to stop processing each sequence. This is all done under the hood, transparent to the user.

To create a PackedSequence object, we can use the method pack_padded_sequence(). Note that without preprocessing the batch, the method pack_padded_sequence() will change the order of the sequences in the batch. This is a problem since the order of the sequences would then no longer match the order of our target labels.

To solve this, the best way is to "manually" sort all sequences in a batch from longest to shortest and to rearrange the order of the target labels accordingly. If the sequences in the batch are sorted from longest to shortest, the method pack_padded_sequence() will not change their order, and the sequences and target labels stay aligned.

The method sort_batch() below accomplishes this. Note how inputs and targets get re-ordered the same way by using the same list of indices. We also need to return the list of lengths since this information is needed by the method pack_padded_sequence().

In [12]:
def sort_batch(inputs, targets, lengths):
    # Sort sequences w.r.t. their lengths from longest to shortest
    lengths_sorted, sorted_idx = lengths.sort(descending=True)
    # Return re-ordered inputs and targets, as well as the lengths (sorted from longest to shortest)
    return inputs[sorted_idx], targets[sorted_idx], lengths_sorted

Let's apply this method on our batch with the minimum padding.

In [13]:
# Extract the lengths for all sequences in the batch
lengths = torch.LongTensor([ len(seq) for seq in sequences ])

# Sort inputs and targets in parallel to ensure they remain aligned
sequences_padded_sorted, targets_sorted, lengths_sorted = sort_batch(sequences_padded, targets, lengths)

print(targets_sorted)
print(lengths_sorted)
print(sequences_padded_sorted)
tensor([0, 0, 1, 0, 0, 1, 1, 1])
tensor([12, 12, 12, 11, 10, 10,  9,  8])
tensor([[ 6, 17, 18, 25,  9, 11,  7, 26,  6, 12,  7,  8],
        [13, 17, 14, 15,  9,  6, 12,  7, 16, 19, 10,  8],
        [13, 10,  9,  6, 22, 16, 13, 10,  9, 30, 23,  8],
        [ 6, 15,  9, 11,  7, 18, 19, 10,  6, 20,  8,  0],
        [11,  7, 14, 21, 27, 12,  7, 14, 21,  8,  0,  0],
        [ 6, 15, 28, 29, 22, 23, 16,  6, 10,  8,  0,  0],
        [12, 13,  4, 15, 18,  2,  4, 10,  8,  0,  0,  0],
        [24, 20,  1, 24,  9,  1,  7,  8,  0,  0,  0,  0]])

As you can clearly see, all sequences in the batch are now sorted from longest to shortest.

Now we have everything to create a PackedSequence object using the method pack_padded_sequence().

In [14]:
sequences_packed = torch.nn.utils.rnn.pack_padded_sequence(sequences_padded_sorted, lengths_sorted, batch_first=True)

print(sequences_packed)
PackedSequence(data=tensor([ 6, 13, 13,  6, 11,  6, 12, 24, 17, 17, 10, 15,  7, 15, 13, 20, 18, 14,
         9,  9, 14, 28,  4,  1, 25, 15,  6, 11, 21, 29, 15, 24,  9,  9, 22,  7,
        27, 22, 18,  9, 11,  6, 16, 18, 12, 23,  2,  1,  7, 12, 13, 19,  7, 16,
         4,  7, 26,  7, 10, 10, 14,  6, 10,  8,  6, 16,  9,  6, 21, 10,  8, 12,
        19, 30, 20,  8,  8,  7, 10, 23,  8,  8,  8,  8]), batch_sizes=tensor([8, 8, 8, 8, 8, 8, 8, 8, 7, 6, 4, 3]), sorted_indices=None, unsorted_indices=None)

The output from the code cell above is arguably not obvious to interpret. But again, a PackedSequence implements an internal representation to speed up processing when using RNNs.

Not very surprisingly, the method for reversing this operation, i.e., unpacking a PackedSequence, is called pad_packed_sequence(). For example, we can unpack the PackedSequence we just created as shown in the code cell below:

In [15]:
sequences_unpacked, lengths_unpacked = torch.nn.utils.rnn.pad_packed_sequence(sequences_packed, batch_first=True)

print(sequences_unpacked)
tensor([[ 6, 17, 18, 25,  9, 11,  7, 26,  6, 12,  7,  8],
        [13, 17, 14, 15,  9,  6, 12,  7, 16, 19, 10,  8],
        [13, 10,  9,  6, 22, 16, 13, 10,  9, 30, 23,  8],
        [ 6, 15,  9, 11,  7, 18, 19, 10,  6, 20,  8,  0],
        [11,  7, 14, 21, 27, 12,  7, 14, 21,  8,  0,  0],
        [ 6, 15, 28, 29, 22, 23, 16,  6, 10,  8,  0,  0],
        [12, 13,  4, 15, 18,  2,  4, 10,  8,  0,  0,  0],
        [24, 20,  1, 24,  9,  1,  7,  8,  0,  0,  0,  0]])

In practice, of course, we would first push a PackedSequence through an RNN layer and then unpack the results.
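To give a rough idea of how these pieces fit together, the sketch below runs our sorted batch through the typical embed, pack, RNN, unpack pipeline. The vocabulary size, embedding dimension, and hidden dimension are arbitrary values chosen for this illustration only and are not part of the example dataset.

# A minimal sketch of the typical pipeline: embed -> pack -> RNN -> unpack.
# vocab_size, embed_dim, and hidden_dim are arbitrary illustrative values.
vocab_size, embed_dim, hidden_dim = 31, 16, 32

embedding = torch.nn.Embedding(vocab_size, embed_dim, padding_idx=0)
rnn = torch.nn.GRU(embed_dim, hidden_dim, batch_first=True)

# Embed the sorted, padded batch: (batch_size, max_seq_len, embed_dim)
embedded = embedding(sequences_padded_sorted)

# Pack using the true sequence lengths so the RNN can skip the padded positions
packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths_sorted, batch_first=True)

# The RNN only processes the non-padded time steps
packed_output, hidden = rnn(packed)

# Unpack the outputs back into a padded tensor: (batch_size, max_seq_len, hidden_dim)
output, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)

print(output.shape)   # torch.Size([8, 12, 32])
print(hidden.shape)   # torch.Size([1, 8, 32])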


Approach 2: Enforcing Equal-Length Batches with a Batch Sampler¶

When using RNNs, compared to CNNs, only the sequences within the same batch have to be of the same length; two different batches may contain sequences of different lengths. We saw how we can solve this using padding and (optionally) packing. However, both steps add some overhead in terms of code to write — with the risk of making mistakes — and steps to execute. They also do not explicitly address the issue that a batch may contain both very short and very long sequences, as inputs are often randomized. While padding and packing still work, having a mix of sequences with very different lengths means that the short sequences need to be padded a lot. Packing ensures that the padding tokens do not affect the training, but we still create unnecessarily large tensors.

So here is the question: Why not put only sequences of the same length into the same batch to begin with? If all sequences in a batch have the same length, then there is no longer any need for padding and packing. The data utilities provided by PyTorch make this surprisingly easy. In more detail, we can implement our own BatchSampler to create only batches that contain sequences of the same length.

Important: Throughout the rest of the notebook, we assume that our example dataset containing the 8 sentences is the complete dataset and not just a single batch!

Create Dataset Class¶

We first create a simple Dataset. The Dataset class in the PyTorch library is an abstract base class that provides a standard way to represent datasets. It allows users to define custom datasets by implementing two key methods: __len__() to return the number of samples and __getitem__() to retrieve a specific data sample and its corresponding label. PyTorch also offers ready-to-use datasets through subclasses like torchvision.datasets. This class is useful because it provides flexibility in data handling and preparation. It enables efficient loading and preprocessing of data on-the-fly, making it suitable for large datasets that cannot fit entirely in memory. When combined with the DataLoader class (see below), it supports batch loading, shuffling, and parallel data processing, which are essential for efficient training of neural networks.

Our new BaseDataset class only stores our inputs and targets and needs to implement the __len__() and __getitem__() methods. Notice that targets may be None. A common example is when working with datasets for training language models, where the targets can be derived directly from the inputs on the fly; thus, there is no need to store the targets explicitly. Since our class extends the abstract class Dataset, we can later use an instance to create a DataLoader. Without going into too much detail, this approach not only allows for cleaner code but also supports parallel data loading on multiple CPU workers and optimized data transfer between the CPU and GPU, which is critical when processing very large amounts of data. It is therefore the recommended best practice.

In [16]:
class BaseDataset(Dataset):

    def __init__(self, inputs, targets=None):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        if self.targets is None:
            return np.asarray(self.inputs[index])
        else:
            return np.asarray(self.inputs[index]), np.asarray(self.targets[index])

Our BaseDataset class is trivial since it already receives lists as input. In practice, a custom Dataset class may also implement file handling or other preprocessing steps.

Classification Dataset Example¶

We first look at our example corpus for text classification. This means only our inputs are sequences while our targets are class labels. Later, we also show the case where both inputs and targets are sequences using a mock corpus for machine translation. Our BaseDataset class is simple and flexible enough to handle both cases.

Create Dataset¶

With our BaseDataset class implemented, we can create an instance using our example classification dataset. Note that the class gets the list of initial sequences, i.e., without any padding!

In [17]:
dataset = BaseDataset(sequences, targets)

If all sequences had the same length, we could already create a DataLoader and use it as shown below. However, since our sequences have different lengths, the code cell below would throw an error. This is because the DataLoader returns tensors, which need to be "full" multidimensional arrays. You can uncomment the code below to convince yourself of the error.

In [18]:
#loader = DataLoader(dataset, batch_size=5)

#for X_batch, y_batch in loader:
#    print(X_batch)
#    print(y_batch)

Create Batch Sampler¶

To solve this problem, we first create our custom Sampler. The Sampler class in PyTorch is an abstract base class that defines how indices are selected from a dataset for data loading. It serves as the foundation for implementing custom sampling strategies by requiring subclasses to implement the __iter__() method, which yields dataset indices, and the __len__() method to specify the number of samples. This class is useful because it provides flexibility in controlling data loading behavior beyond simple sequential or random sampling. It allows for tailored sampling strategies such as stratified sampling, weighted sampling for imbalanced datasets, or dynamic data selection during training. By integrating with PyTorch's DataLoader, Sampler ensures efficient and customizable data retrieval, optimizing model training performance in various applications.

For this and other notebooks, we provide the class EqualLengthsBatchSampler, which analyzes the input sequences and organizes all sequences into groups of the same length. Each batch is then sampled from a single group, ensuring that all sequences in the batch have the same length. Feel free to have a look at the EqualLengthsBatchSampler class in the file src/utils/sampling/batchsampler.py to see how this organization is done; it's pretty straightforward.
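To convey the basic idea, the hypothetical SimpleEqualLengthsBatchSampler below sketches a simplified version that groups sample indices only by input sequence length; the provided EqualLengthsBatchSampler also takes the targets into account, so this is not its actual implementation.

# A simplified sketch of the grouping idea behind an equal-lengths batch sampler
# (illustration only; not the provided EqualLengthsBatchSampler implementation)
import random
from torch.utils.data import Sampler

class SimpleEqualLengthsBatchSampler(Sampler):

    def __init__(self, batch_size, sequences):
        self.batch_size = batch_size
        # Group the indices of all samples by their sequence length
        self.groups = {}
        for idx, seq in enumerate(sequences):
            self.groups.setdefault(len(seq), []).append(idx)

    def __len__(self):
        # Total number of batches across all length groups
        return sum(-(-len(idxs) // self.batch_size) for idxs in self.groups.values())

    def __iter__(self):
        batches = []
        for idxs in self.groups.values():
            random.shuffle(idxs)
            # Split each group into batches of at most batch_size indices
            for i in range(0, len(idxs), self.batch_size):
                batches.append(idxs[i:i + self.batch_size])
        random.shuffle(batches)
        yield from batches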

In [19]:
BATCH_SIZE = 5

sampler = EqualLengthsBatchSampler(BATCH_SIZE, sequences, targets)

Create Data Loader¶

Now, we are ready to finally create a DataLoader using the instance of our custom sampler as an input parameter. The DataLoader class in PyTorch is a key utility that facilitates the efficient loading of data during model training and evaluation. It wraps around datasets to provide iterable batches of data, handling shuffling, batching, and parallel loading via multiple worker threads. This abstraction allows developers to focus on model implementation without worrying about the complexities of data loading.

A DataLoader is particularly useful for training on large datasets by splitting them into smaller batches, which reduces memory usage and speeds up computation through parallelization. Additionally, features like shuffling ensure that models generalize better by mitigating the risk of learning patterns specific to data order. It supports custom datasets, making it highly flexible for diverse machine learning tasks. Since our custom sampler does the organization, we have to tell the data loader not to shuffle the samples with shuffle=False.

In [20]:
loader = DataLoader(dataset, batch_sampler=sampler, shuffle=False, drop_last=False)

Now let's use the data loader as we would in a training loop.

In [21]:
for batch_nr, (X_batch, y_batch) in enumerate(loader):
    print("========= Batch {} =========".format(batch_nr+1))
    print(X_batch)
========= Batch 1 =========
tensor([[ 6, 15,  9, 11,  7, 18, 19, 10,  6, 20,  8]])
========= Batch 2 =========
tensor([[12, 13,  4, 15, 18,  2,  4, 10,  8]])
========= Batch 3 =========
tensor([[24, 20,  1, 24,  9,  1,  7,  8]])
========= Batch 4 =========
tensor([[ 6, 15, 28, 29, 22, 23, 16,  6, 10,  8],
        [11,  7, 14, 21, 27, 12,  7, 14, 21,  8]])
========= Batch 5 =========
tensor([[13, 17, 14, 15,  9,  6, 12,  7, 16, 19, 10,  8],
        [13, 10,  9,  6, 22, 16, 13, 10,  9, 30, 23,  8],
        [ 6, 17, 18, 25,  9, 11,  7, 26,  6, 12,  7,  8]])

As you can see, all batches contain only sequences of the same length, so no padding and packing is required. But also note that we naturally cannot guarantee that each batch is of size 5 as specified. If there are not enough sequences of the same length, the respective batch won't be full. However, this is not an issue in practice, where we typically deal with datasets of hundreds of thousands of sequences or more.

Sequence-to-Sequence (Seq2Seq) Dataset Example¶

Our initial corpus was an example data set for text classification, so only the inputs are sequences (the targets are the class labels). However, for sequence-to-sequence (seq2seq) tasks such as machine translation, both inputs and targets are sequences. To avoid padding and packing — and potentially masking which we do not cover in this notebook — we have to ensure that a batch only contains sequence pairs of the same length.

To clarify, this does not mean that the input sequences and target sequences need to have the same length, but only that all the input sequences have the same length and all the target sequences have the same length. For example, we can have a batch where all input sequences are of length 10, and all target sequences are of length 14.

Create Dataset, Batch Sampler & Data Loader¶

We can now create a data loader like above. The only small difference is that the targets are now also sequences — compared to simple class labels as in the classification dataset. However, this does not affect the code itself. The handling of dataset samples with different lengths is completely handled by the EqualLengthsBatchSampler.

In [22]:
# Create Dataset
dataset_seq2seq = BaseDataset(input_sequences, target_sequences)
# Create BatchSampler
sampler_seq2seq = EqualLengthsBatchSampler(BATCH_SIZE, input_sequences, target_sequences)
# Create DataLoader
loader_seq2seq = DataLoader(dataset_seq2seq, batch_sampler=sampler_seq2seq, shuffle=False, drop_last=False)

Again, we can loop through all batches using the created data loader as we would within a training loop.

In [23]:
for idx, (batch_inputs, batch_targets) in enumerate(loader_seq2seq):
    print("========= BATCH {} =========".format(idx))
    print(batch_inputs)
    print(batch_targets)    
========= BATCH 0 =========
tensor([[1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]])
tensor([[1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4]])
========= BATCH 1 =========
tensor([[1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]])
tensor([[1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4]])
========= BATCH 2 =========
tensor([[1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4]])
tensor([[1, 2],
        [1, 2],
        [1, 2]])
========= BATCH 3 =========
tensor([[1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4]])
tensor([[1, 2],
        [1, 2],
        [1, 2]])
========= BATCH 4 =========
tensor([[1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4]])
tensor([[1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4]])
========= BATCH 5 =========
tensor([[1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]])
tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])
========= BATCH 6 =========
tensor([[1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]])
tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])

With the default value of BATCH_SIZE = 5, notice (a) that no batch contains more than 5 input-target pairs and (b) that, within a batch, all input-target samples have the same combination of lengths. Of course, a batch might have fewer than 5 samples in case there are fewer than 5 samples with a specific combination of input and target lengths. But again, for large real-world datasets, the likelihood of this happening is very small.

Since we shuffle the batches in EqualLengthsBatchSampler, re-running the code cell above will naturally result in different outputs. Feel free to change the value of BATCH_SIZE to see how it affects the results and whether you can explain them.


Summary¶

When working with text, properly handling sequences of different lengths is crucial. While we could avoid any issues by using only batches of size 1, this would sacrifice too much performance in practice when training our models over large datasets. Training neural networks using batches instead of sample by sample is more efficient because it leverages vectorized operations and hardware acceleration, particularly on GPUs. In batch training, multiple samples are processed simultaneously, which allows matrix multiplications and other operations to be optimized for parallel computation. This significantly speeds up training compared to handling one sample at a time.

Additionally, batch training helps stabilize gradient estimates during optimization. Calculating gradients over multiple samples provides a better approximation of the true gradient compared to noisy, individual updates. This balance between computational efficiency and gradient stability makes batch processing both faster and more effective for training deep learning models. So since we aim for a larger batch size, we need to ensure that each batch contains sequences of the same length.

In this notebook, we looked into the most common best practices to solve this, particularly when working with CNNs and RNNs. To ensure sequences in a batch have the same length during neural network training, the common strategies we have covered include:

  • Padding: Sequences are extended to match the length of the longest sequence in the batch by adding a special token (e.g., zero) to the shorter sequences. This maintains a consistent input size but requires masking during computation to ignore padded values.

  • Truncation: Sequences longer than a predefined maximum length are cut off to ensure uniform length, which helps reduce computation time but may lead to information loss.

  • Bucketing: Sequences are grouped into batches of similar or even identical lengths, minimizing the need for excessive padding while maintaining efficient processing. Our custom EqualLengthsBatchSampler is an example of this approach.

These strategies improve training efficiency and ensure compatibility with models requiring fixed-size inputs.

A closely related topic to padding is masking. Masking refers to a technique where the model is explicitly told to ignore padded (and potentially other) values during computations, so these masked elements do not contribute to learning or affect the output. This is important because masked values are not meaningful and should not impact the model's performance. Masking usually involves creating a mask (a binary matrix) that indicates which positions in the sequence are padded (or should otherwise be ignored) and which are actual data points. Neural networks can then use this mask to skip over those values when computing gradients, loss, or attention. Although it addresses issues closely related to padding, masking is a distinct concept and is therefore covered in a separate notebook.

In [ ]: