diff --git a/load_tinystories.py b/load_tinystories.py
index 56e3a88..606b42c 100644
--- a/load_tinystories.py
+++ b/load_tinystories.py
@@ -217,16 +217,18 @@ def sequence_to_words(sequence, N=12):
     words = [''.join(bits[i:i + N]) for i in range(0, len(bits), N)]
     return words
 
-def calculate_energy(sequences, batch_size=32):
+def calculate_energy(sequences, batch_size=32, h=0.1):
     """
-    Calculate the energy of sequences using batched processing.
-    Returns energies and Hamiltonian matrix.
+    Calculate the energy of sequences using batched processing with magnetic field.
+    Returns energies and weight matrix W.
+    h: magnetic field strength
     """
     num_sequences = len(sequences)
     seq_length = sequences[0].shape[0]
 
-    # Initialize Hamiltonian matrix
-    hamiltonian = np.zeros((seq_length, seq_length))
+    # Initialize weight matrix and magnetic field
+    W = np.zeros((seq_length, seq_length))
+    h_field = h * np.ones(seq_length).reshape(-1, 1)  # Uniform magnetic field
     energies = []
 
     # Process sequences in batches
@@ -234,37 +236,42 @@ def calculate_energy(sequences, batch_size=32):
         batch = sequences[i:min(i + batch_size, num_sequences)]
         batch = np.array(batch)  # Convert batch to numpy array
 
-        # Calculate batch energies
-        batch_energies = np.sum(batch * batch.transpose(0, 2, 1), axis=(1, 2)) / -2
+        # Calculate batch contribution to weight matrix (Hebbian learning)
+        for seq in batch:
+            W += np.dot(seq, seq.T)
+
+        # Calculate batch energies including magnetic field
+        batch_energies = []
+        for seq in batch:
+            # E = -1/2 * s^T * W * s - h * sum(s)
+            # Properly extract scalar values from matrix multiplications
+            spin_spin_matrix = seq.T.dot(W).dot(seq)
+            spin_spin = -0.5 * float(spin_spin_matrix[0, 0])
+
+            magnetic_matrix = h_field.T.dot(seq)
+            magnetic = -float(magnetic_matrix[0, 0])
+
+            energy = spin_spin + magnetic
+            batch_energies.append(energy)
+
         energies.extend(batch_energies)
-
-        # Update Hamiltonian
-        batch_hamiltonian = np.sum(np.matmul(batch, batch.transpose(0, 2, 1)), axis=0)
-        hamiltonian += batch_hamiltonian
-
-        # Free memory
-        del batch
-        del batch_energies
-        del batch_hamiltonian
 
-    # Normalize Hamiltonian
-    hamiltonian = hamiltonian / num_sequences
+    # Normalize weight matrix
+    W = W / num_sequences
 
-    return np.array(energies), hamiltonian
+    return np.array(energies), W, h_field
 
-def retrieve_sequences(sequences, partial_sequence, vocab, W, M=10, N=12, temperature=1.0):
+def retrieve_sequences(sequences, partial_sequence, vocab, W, M=10, N=12, temperature=1.0, h=0.1):
     """
-    Retrieve the most likely next word using Ising Hamiltonian with temperature.
-    Uses associative memory to retrieve the last word of the sequence.
+    Retrieve the most likely next word using Ising Hamiltonian with magnetic field.
     """
-    # Convert partial sequence to vector
-    partial_vec = np.array([int(bit) for bit in partial_sequence]).reshape(-1, 1)
-
     # Get all possible words from vocabulary
     possible_words = list(vocab.values())
 
-    # Calculate weights matrix (Hebbian learning)
-    # Calculate energies for all possible words
+    # Create magnetic field
+    h_field = h * np.ones(M * N).reshape(-1, 1)
+
+    # Calculate energies for all possible completions
    word_energies = []
 
     for word in possible_words:
@@ -273,29 +280,36 @@ def retrieve_sequences(sequences, partial_sequence, vocab, W, M=10, N=12, temper
         if len(complete_sequence) == M*N:  # Ensure correct length
             complete_vec = np.array([int(bit) for bit in complete_sequence]).reshape(M * N, 1)
 
-            # Calculate energy using Ising Hamiltonian
-            energy_matrix = complete_vec.T.dot(W).dot(complete_vec)
-            energy = float(energy_matrix[0, 0])
+            # Calculate energy with both interaction and magnetic field terms
+            spin_spin = 0
+            for seq in sequences:
+                # Properly extract scalar from matrix multiplication
+                overlap_matrix = complete_vec.T.dot(seq)
+                overlap = overlap_matrix[0, 0]  # Extract single scalar value
+                spin_spin -= overlap * overlap
 
-            word_energies.append((word, energy))
+            # Extract scalar from magnetic field contribution
+            magnetic_matrix = h_field.T.dot(complete_vec)
+            magnetic = -float(magnetic_matrix[0, 0])
+            total_energy = spin_spin + magnetic
+
+            word_energies.append((word, total_energy))
 
     # Sort by energy
     word_energies.sort(key=lambda x: x[1])
 
-    # Normalize energies to prevent overflow
+    # Normalize energies
     energies = np.array([e[1] for e in word_energies])
-    energies = energies - np.min(energies)  # Shift to make minimum energy 0
-    energies = energies / np.max(energies) if np.max(energies) > 0 else energies  # Scale to [0,1]
+    energies = energies - np.min(energies)
+    max_energy = np.max(energies)
+    if max_energy > 0:
+        energies = energies / max_energy
 
-    # Calculate probabilities with normalized energies
+    # Calculate probabilities with Boltzmann distribution
     probabilities = np.exp(-energies/temperature)
     probabilities = probabilities / np.sum(probabilities)
 
-    # Check for valid probabilities
-    if np.any(np.isnan(probabilities)):
-        # Fallback to uniform distribution if numerical issues occur
-        probabilities = np.ones(len(word_energies)) / len(word_energies)
-
+    # Sample from distribution
     selected_idx = np.random.choice(len(word_energies), p=probabilities)
     best_word, min_energy = word_energies[selected_idx]
 
@@ -340,9 +354,10 @@ def predict_sequence(initial_sequence, vocab, sequences, W, D=10, M=100, N=12, t
 
 if __name__ == "__main__":
     N = 20  # Define N as a constant
-    M = 20  # Define M as a constant
+    M = 100  # Define M as a constant
     D = 10  # Number of words to predict
-    temperature = 0.10
+    temperature = 1
+    batch_size = 50  # Added batch size parameter
 
     print("Loading and encoding stories...")
 
@@ -360,7 +375,7 @@ if __name__ == "__main__":
     # Get initial sequence from first story
     story_tokens = tokenize_with_punctuation(original_stories[0])
 
-    _, W = calculate_energy(sequences)
+    _, W, _ = calculate_energy(sequences)
 
     # Make sure we have enough tokens for M=100
     if len(story_tokens) >= M-1: