From 147fb923d3e4a960daafe55ed3f02b14f9d48693 Mon Sep 17 00:00:00 2001
From: CarlaMue
Date: Wed, 11 Feb 2026 14:52:29 +0100
Subject: [PATCH] reformatted readme

---
 README.md              | 24 ++++++++++++------------
 src/attention_model.py |  2 +-
 src/util.py            |  2 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 64ccca5..07f0f2e 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,23 @@
 # Language Modelling Exercise
 This exercise will allow you to explore language modelling. We focus on the key concept of multi-head attention.
-Navigate to the `src/attention_model.py`-file and implement multi-head attention [1]
-``` math
-\text{Attention}(\mathbf{Q},\mathbf{K},\mathbf{V}) = \text{softmax}(\frac{\mathbf{Q}\mathbf{K}^T}{\sqrt{d_k}})\mathbf{V}
-```
+1. Navigate to the `src/attention_model.py` file and implement multi-head attention [1]
 
-To make attention useful in a language modelling scenario we cannot use future information. A model without access to upcoming future inputs or words is known as causal.
-Since our attention matrix is multiplied from the left we must mask out the upper triangle
-excluding the main diagonal for causality.
+   ``` math
+   \text{Attention}(\mathbf{Q},\mathbf{K},\mathbf{V}) = \text{softmax}(\frac{\mathbf{Q}\mathbf{K}^T}{\sqrt{d_k}})\mathbf{V}
+   ```
 
-Keep in mind that $\mathbf{Q} \in \mathbb{R}^{b,h,o,d_k}$, $\mathbf{K} \in \mathbb{R}^{b,h,o,d_k}$ and $\mathbf{V} \in \mathbb{R}^{b,h,o,d_v}$, with $b$ the batch size, $h$ the number of heads, $o$ the desired output dimension, $d_k$ the key dimension and finally $d_v$ as value dimension. Your code must rely on broadcasting to process the matrix operations correctly. The notation follows [1].
+   To make attention useful in a language modelling scenario, we cannot use future information. A model without access to future inputs or words is known as causal.
+   Since our attention matrix is multiplied from the left, we must mask out the upper triangle
+   excluding the main diagonal for causality.
 
-Furthermore write a function to convert the network output of vector encodings back into a string by completing the `convert` function in `src/util.py`.
+   Keep in mind that $\mathbf{Q} \in \mathbb{R}^{b,h,o,d_k}$, $\mathbf{K} \in \mathbb{R}^{b,h,o,d_k}$ and $\mathbf{V} \in \mathbb{R}^{b,h,o,d_v}$, with $b$ the batch size, $h$ the number of heads, $o$ the desired output dimension, $d_k$ the key dimension, and $d_v$ the value dimension. Your code must rely on broadcasting to process the matrix operations correctly. The notation follows [1].
+2. Furthermore, write a function to convert the network output of vector encodings back into a string by completing the `convert` function in `src/util.py`.
+
+3. Once you have implemented and tested your version of attention, run `sbatch scripts/train.slurm` to train your model on Bender. Once it has converged, you can generate poetry via `sbatch scripts/generate.slurm`.
+   Run `src/model_chat.py` to talk to your model.
 
 [1] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin: Attention is All you Need. NIPS 2017: 5998-6008
-
-Once you have implemented and tested your version of attention run `sbatch scripts/train.slurm` to train your model on Bender. Once converged you can generate poetry via `sbatch scripts/generate.slurm`.
-Run `src/model_chat.py` to talk to your model.
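For reference, the causal scaled dot-product attention described in the README hunk above can be sketched in a few lines of PyTorch. This is only an illustrative sketch, not the exercise's reference solution: the argument names and the body are assumptions, and only the function name `dot_product_attention` and the tensor shapes [batch, heads, out_length, d_k] / [batch, heads, out_length, d_v] come from the repository.

```python
import math

import torch


def dot_product_attention(
    query: torch.Tensor,  # [batch, heads, out_length, d_k] (argument names assumed)
    key: torch.Tensor,    # [batch, heads, out_length, d_k]
    value: torch.Tensor,  # [batch, heads, out_length, d_v]
) -> torch.Tensor:
    """Causal scaled dot-product attention over [batch, heads, out_length, d_*] tensors."""
    d_k = query.shape[-1]
    # Scaled dot product; broadcasting takes care of the batch and head dimensions.
    scores = query @ key.transpose(-2, -1) / math.sqrt(d_k)
    # Causal mask: everything above the main diagonal must not be attended to.
    out_length = scores.shape[-1]
    causal_mask = torch.tril(torch.ones(out_length, out_length, device=scores.device)).bool()
    scores = scores.masked_fill(~causal_mask, float("-inf"))
    weights = torch.nn.functional.softmax(scores, dim=-1)
    return weights @ value  # [batch, heads, out_length, d_v]
```

Because the mask keeps the main diagonal, each position attends only to itself and to earlier positions, which is exactly the causality constraint stated in the README.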
diff --git a/src/attention_model.py b/src/attention_model.py
index 0446f9b..f5b1ec5 100644
--- a/src/attention_model.py
+++ b/src/attention_model.py
@@ -27,7 +27,7 @@ def dot_product_attention(
     Returns:
         torch.Tensor: The attention values of shape [batch, heads, out_length, d_v]
     """
-    # TODO implement multi head attention.
+    # 1. TODO: implement multi head attention.
     # Hint: You will likely need torch.transpose, torch.sqrt, torch.tril,
     # torch.inf, and torch.nn.functional.softmax.
     # For applying the causal mask, you can either try using torch.exp or torch.masked_fill.
diff --git a/src/util.py b/src/util.py
index 3bef9bc..734aa5e 100644
--- a/src/util.py
+++ b/src/util.py
@@ -98,5 +98,5 @@ def convert(sequences: torch.Tensor, inv_vocab: dict) -> list:
         list: A list of characters.
     """
     res = []
-    # TODO: Return a nested list of characters.
+    # 2. TODO: Return a nested list of characters.
     return res
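The `convert` TODO in `src/util.py` can likewise be sketched along the following lines. The body is an assumption on top of the stub shown in the hunk: it presumes `sequences` holds integer token indices of shape [batch, length] (or logits/one-hot vectors with a trailing vocabulary dimension, reduced here by an argmax) and that `inv_vocab` maps integer indices to characters.

```python
import torch


def convert(sequences: torch.Tensor, inv_vocab: dict) -> list:
    """Turn model output back into a nested list of characters."""
    # Assumption: a trailing vocabulary dimension (logits or one-hot vectors)
    # is reduced to token indices first.
    if sequences.dim() == 3:
        sequences = sequences.argmax(dim=-1)
    res = []
    for sequence in sequences:
        # Look up every integer index in the inverse vocabulary.
        res.append([inv_vocab[int(token)] for token in sequence])
    return res
```

For example, with `inv_vocab = {0: 'h', 1: 'i'}`, `convert(torch.tensor([[0, 1]]), inv_vocab)` returns `[['h', 'i']]`.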