diff --git a/.gradient/notebook-tests.yaml b/.gradient/notebook-tests.yaml
index 0659cc3..1b54b94 100644
--- a/.gradient/notebook-tests.yaml
+++ b/.gradient/notebook-tests.yaml
@@ -167,4 +167,26 @@ useful-managing-ipu-resources:
   generated: true
   notebook:
     file: managing_ipu_resources.ipynb
-    timeout: 1000
\ No newline at end of file
+    timeout: 1000
+
+# Packed BERT tests (notebooks live in ../packed-bert/ relative to this file)
+packed-bert-single-label:
+  location: ../packed-bert/
+  generated: true
+  notebook:
+    file: packedBERT_single_label_text_classification.ipynb
+    timeout: 10000
+
+packed-bert-multi-label:
+  location: ../packed-bert/
+  generated: true
+  notebook:
+    file: packedBERT_multi_label_text_classification.ipynb
+    timeout: 10000
+
+packed-bert-question-answering:
+  location: ../packed-bert/
+  generated: true
+  notebook:
+    file: packedBERT_question_answering.ipynb
+    timeout: 10000
diff --git a/.gradient/prepare-datasets.sh b/.gradient/prepare-datasets.sh
index 0ff3efc..5b34a09 100755
--- a/.gradient/prepare-datasets.sh
+++ b/.gradient/prepare-datasets.sh
@@ -42,6 +42,30 @@ echo "Starting preparation of datasets"
 # symlink exe_cache files
 exe_cache_source_dir="${PUBLIC_DATASETS_DIR}/poplar-executables-hf-3-1"
 symlink-public-resources "${exe_cache_source_dir}" $POPLAR_EXECUTABLE_CACHE_DIR
+
+# packed bert executables: each *_source_dir is the public dataset path that gets linked into the executable cache dir
+packed_sl_exe_cache_source_dir="${PUBLIC_DATASETS_DIR}/packed_bert_slseqcls_exe_cache/packed_bert_slseqcls"
+symlink-public-resources "${packed_sl_exe_cache_source_dir}" "${POPLAR_EXECUTABLE_CACHE_DIR}/packed_bert_slseqcls_exe_cache"
+packed_ml_exe_cache_source_dir="${PUBLIC_DATASETS_DIR}/packed_bert_mlseqcls_exe_cache/packed_bert_mlseqcls"
+symlink-public-resources "${packed_ml_exe_cache_source_dir}" "${POPLAR_EXECUTABLE_CACHE_DIR}/packed_bert_mlseqcls_exe_cache"
+packed_qa_exe_cache_source_dir="${PUBLIC_DATASETS_DIR}/packed_bert_qa_exe_cache/packed_bert_squad"
+symlink-public-resources "${packed_qa_exe_cache_source_dir}" "${POPLAR_EXECUTABLE_CACHE_DIR}/packed_bert_qa_exe_cache"
+
+# packed bert datasets: linked under POPLAR_EXECUTABLE_CACHE_DIR (HF_DATASETS is only assigned further down, as a list of dataset names)
+packed_sl_dataset_source_dir="${PUBLIC_DATASETS_DIR}/packed_bert_slseqcls_dataset_cache"
+symlink-public-resources "${packed_sl_dataset_source_dir}" "${POPLAR_EXECUTABLE_CACHE_DIR}/packed_bert_slseqcls_dataset_cache"
+packed_ml_dataset_source_dir="${PUBLIC_DATASETS_DIR}/packed_bert_mlseqcls_dataset_cache"
+symlink-public-resources "${packed_ml_dataset_source_dir}" "${POPLAR_EXECUTABLE_CACHE_DIR}/packed_bert_mlseqcls_dataset_cache"
+packed_qa_dataset_source_dir="${PUBLIC_DATASETS_DIR}/packed_bert_qa_dataset_cache"
+symlink-public-resources "${packed_qa_dataset_source_dir}" "${POPLAR_EXECUTABLE_CACHE_DIR}/packed_bert_qa_dataset_cache"
+
+# packed bert inference checkpoints
+symlink-public-resources "${PUBLIC_DATASETS_DIR}/bert-base-uncased-sst2" "${CHECKPOINT_DIR}/bert-base-uncased-sst2"
+symlink-public-resources "${PUBLIC_DATASETS_DIR}/bert-base-uncased-go_emotions" "${CHECKPOINT_DIR}/bert-base-uncased-go_emotions"
+symlink-public-resources "${PUBLIC_DATASETS_DIR}/bert-base-uncased-squad" "${CHECKPOINT_DIR}/bert-base-uncased-squad"
+
+
+
 # symlink HF datasets
 HF_DATASETS="conll2003 glue imagefolder librispeech_asr squad swag wikitext wmt16 xsum"
 for dataset in ${HF_DATASETS}; do
@@ -50,6 +74,7 @@ for dataset in ${HF_DATASETS}; do
 done
 # Image classification dataset
 symlink-public-resources "${PUBLIC_DATASETS_DIR}/dfki-sentinel-eurosat" "${DATASETS_DIR}/dfki-sentinel-eurosat"
+
 # pre-install the correct version of optimum for this release
 python -m pip install "optimum-graphcore>=0.5, <0.6"
 
diff --git a/.gradient/settings.yaml b/.gradient/settings.yaml
index 3a02448..9c93cc5 100644
--- a/.gradient/settings.yaml
+++ b/.gradient/settings.yaml
@@ -40,3 +40,30 @@ integrations:
   dfki-sentinel-eurosat:
     type: dataset
     ref: paperspace/ds8p6sv96fl1att:k5j4cob
+  bert-base-uncased-sst2:
+    type: dataset
+    ref: paperspace/dskrqljie6pti8y:mfqq5qk
+  bert-base-uncased-go_emotions:
+    type: dataset
+    ref: paperspace/dsz2f8usk60xbos:n3h8ko3
+  bert-base-uncased-squad:
+    type: dataset
+    ref: paperspace/ds9ogwc0fbfh799:3mv59lg
+  packed_bert_slseqcls_exe_cache:
+    type: dataset
+    ref: paperspace/dsfg0gcuqbr0pfc:0pss84k
+  packed_bert_mlseqcls_exe_cache:
+    type: dataset
+    ref: paperspace/dsevh3ol36qzpz2:1yme9yi
+  packed_bert_qa_exe_cache:
+    type: dataset
+    ref: paperspace/dsson0ib8byvqpf:tcgts2v
+  packed_bert_slseqcls_dataset_cache:
+    type: dataset
+    ref: paperspace/dsuuz3dih9su40i:npvb833
+  packed_bert_mlseqcls_dataset_cache:
+    type: dataset
+    ref: paperspace/dsuxwz4nqbbs07s:jipm3jh
+  packed_bert_qa_dataset_cache:
+    type: dataset
+    ref: paperspace/dssvktzrzcoaumk:5zhp5mf
diff --git a/packed-bert/__init__.py b/packed-bert/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packed-bert/images/go_emotions.png b/packed-bert/images/go_emotions.png
new file mode 100644
index 0000000..2d9da10
Binary files /dev/null and b/packed-bert/images/go_emotions.png differ
diff --git a/packed-bert/models/__init__.py b/packed-bert/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packed-bert/models/modeling_bert_packed.py b/packed-bert/models/modeling_bert_packed.py
new file mode 100644
index 0000000..81dabdd
--- /dev/null
+++ b/packed-bert/models/modeling_bert_packed.py
@@ -0,0 +1,282 @@
+# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+import poptorch
+from optimum.graphcore.models.bert.modeling_bert import BertPipelineMixin
+from transformers import BertForQuestionAnswering, BertForSequenceClassification
+from transformers.modeling_outputs import QuestionAnsweringModelOutput
+
+
+class PackedBertPooler(nn.Module):  # installed as `bert.pooler` by PipelinedPackedBertForSequenceClassification
+    def __init__(self, config):  # reads `max_sequences_per_pack` and `hidden_size` from config
+        super().__init__()
+        self.max_seq_per_pack = config.max_sequences_per_pack
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        """
+        Pool a packed batch by taking the hidden states of the last
+        max_sequences_per_pack token positions of every pack (the [CLS] tokens
+        are always located at the end of the pack), flattening them to one row
+        per sequence slot and applying the dense layer followed by tanh. Packs
+        holding fewer than max_sequences_per_pack sequences still produce
+        max_sequences_per_pack rows; not all of them are used in the loss.
+        """
+        sh = hidden_states.shape  # indexed below as sh[0] = batch size, sh[2] = hidden size
+        last_tokens_tensors = hidden_states[:, -self.max_seq_per_pack :]
+        last_reshape = last_tokens_tensors.reshape(sh[0] * self.max_seq_per_pack, sh[2])
+        # output size: [bs x max_sequences_per_pack, hidden_size]
+        output = self.dense(last_reshape)
+        output = self.activation(output)
+
+        return output
+
+
+class PackedBertOutputsForMultiLabel(nn.Module):
+    """
+    Output phase for packed multi-label sequence classification: zeroes the logits (and, when labels are given, the labels and element-wise BCE loss) of unused sequence slots in each pack. Returns `(loss, logits)` when labels are given, otherwise the masked logits alone.
+    """
+
+    def __init__(self, config):  # reads `max_sequences_per_pack` from config
+        super().__init__()
+        self.max_seq_per_pack = config.max_sequences_per_pack
+        self.multi_loss = torch.nn.BCEWithLogitsLoss(reduction="none")
+
+    def forward(
+        self,
+        outputs: Optional[torch.Tensor],  # NOTE(review): accessed as `outputs.logits`, so a model-output object rather than a bare tensor
+        attention_mask: Optional[torch.Tensor],  # packed mask; presumably holds per-token sequence ids with 0 for padding - confirm
+        batch_dim: int,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor]:
+        max_labels = torch.max(attention_mask[:, : -self.max_seq_per_pack], dim=-1).values.unsqueeze(1)  # per-pack max mask value, last max_seq_per_pack positions excluded
+
+        # Build a 0/1 mask over the max_seq_per_pack slots of each pack: slot i is kept when i < max_labels, padding slots become 0
+        label_mask = torch.arange(0, self.max_seq_per_pack).unsqueeze(0).repeat(batch_dim, 1)
+        label_mask = torch.where(
+            label_mask < max_labels,
+            torch.ones(batch_dim, self.max_seq_per_pack),
+            torch.zeros(batch_dim, self.max_seq_per_pack),
+        )
+        label_mask = label_mask.view(-1).unsqueeze(1)  # flatten to [batch_dim * max_seq_per_pack, 1]
+
+        # Zero the logits of padding slots
+        logits = label_mask * outputs.logits
+
+        loss = None
+        if labels is not None:
+            # Merge the first two label dims into one, cast to float32, then zero the labels of padding slots
+            labels = labels.view(-1, *(labels.size()[2:])).to(torch.float32)
+            labels = label_mask * labels
+
+            # Element-wise BCE (reduction="none"), then zero the loss entries of padding slots
+            loss = self.multi_loss(logits, labels)
+            loss *= label_mask
+
+            # Average: summed loss / (sum of max_labels over the batch * size of the last label dim)
+            loss = torch.sum(loss) / (torch.sum(max_labels) * labels.shape[-1])
+            loss = poptorch.identity_loss(loss, reduction="none")
+
+            logits = logits.reshape([batch_dim, self.max_seq_per_pack, logits.shape[-1]])  # restore a per-pack leading dim
+
+            return (loss, logits)
+        else:
+            return logits  # NOTE(review): masked but not reshaped per pack, unlike the labelled branch
+
+
+class PipelinedPackedBertForSequenceClassification(BertForSequenceClassification, BertPipelineMixin):
+    """
+    This class supports doing single-label/multi-label sequence-classification tasks with custom outputs.
+    `config.problem_type` selects the path: "single_label_classification" uses the parent class loss; any other value (intended: "multi_label_classification") is routed through PackedBertOutputsForMultiLabel, which masks labels, logits and loss itself.
+
+    In both cases:
+        * When labels are passed, the logits are reshaped at output from the 'unpacked' batch dimension to a batch dimension equivalent to that of the labels passed to the model, in order for Optimum's trainer class to perform evaluation. Without labels the logits are returned un-reshaped.
+
+        * The attention mask is reshaped from the 'packed' attention mask to an equivalent binary 3D "extended" attention mask for BERT to recognise the sequences within a single packed input as unrelated sequences.
+    """
+
+    def __init__(self, config):  # reads `max_sequences_per_pack`, `problem_type` and `num_labels` from config
+        super().__init__(config)
+        self.max_seq_per_pack = config.max_sequences_per_pack
+        self.problem_type = config.problem_type
+        self.num_labels = config.num_labels
+
+        self.bert.pooler = PackedBertPooler(config)  # replace the pooler created by the parent constructor
+        self.multi_label_outputs = PackedBertOutputsForMultiLabel(config)
+
+    def parallelize(self):  # returns self so calls can be chained
+        super().parallelize()
+        last_ipu = self.ipu_config.ipus_per_replica - 1
+        self.classifier = poptorch.BeginBlock(self.classifier, "Classifier Output", ipu_id=last_ipu)  # classifier head starts a block on the last IPU
+        return self
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,  # required in practice: its shape is read unconditionally below
+        attention_mask: Optional[torch.Tensor] = None,  # required in practice: indexed unconditionally below
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,  # not forwarded to super().forward()
+        inputs_embeds: Optional[torch.Tensor] = None,  # not forwarded to super().forward()
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,  # not forwarded to super().forward()
+        output_hidden_states: Optional[bool] = None,  # not forwarded to super().forward()
+        return_dict: Optional[bool] = None,  # not forwarded to super().forward()
+    ) -> Tuple[torch.Tensor]:
+        bs = input_ids.shape[0]
+        seq_len = input_ids.shape[1]
+
+        attention_mask_3d = attention_mask[:, None, :].repeat(1, seq_len, 1)  # repeat the mask row seq_len times along a new middle dim
+        attention_mask_3d = (attention_mask_3d == attention_mask_3d.transpose(1, 2)) * (attention_mask_3d != 0)  # set where both positions share the same non-zero mask value
+
+        # Labels are forwarded only for single-label, where the parent loss allows ignore_index; multi-label masking of logits and loss is done manually below
+        output = super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask_3d,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            labels=labels if labels is not None and self.problem_type == "single_label_classification" else None,
+        )
+
+        if self.problem_type == "single_label_classification":
+            if labels is not None:  # with labels: return (loss, logits) with logits regrouped per pack
+                logits = output.logits.reshape([-1, self.max_seq_per_pack, self.num_labels])
+                output.logits = logits
+                output = (output.loss, output.logits)
+            else:  # without labels: return the logits exactly as produced by the parent forward
+                output = output.logits
+
+        else:  # every other problem_type takes the multi-label output path
+            output = self.multi_label_outputs(
+                outputs=output, attention_mask=attention_mask, batch_dim=bs, labels=labels
+            )
+
+        return output
+
+
+class PackedBertOutputsForQA(nn.Module):
+    """
+    Output phase for packed question answering: expands the packed start/end logits into one row per sequence slot of each pack, optionally computes the averaged start/end cross-entropy loss, and returns a QuestionAnsweringModelOutput.
+    """
+
+    def __init__(self, config):  # reads `max_sequences_per_pack` from config
+        super().__init__()
+        # Use the default QA model output formatting class to return outputs in the same form as the base model.
+        self.output = QuestionAnsweringModelOutput
+        self.max_sequences_per_pack = config.max_sequences_per_pack
+
+    def forward(
+        self,
+        final_layer_output: Optional[torch.Tensor] = None,  # NOTE(review): read via .start_logits/.end_logits/.hidden_states/.attentions, so a model-output object
+        attention_mask: Optional[torch.Tensor] = None,  # packed mask; required in practice (indexed unconditionally below)
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+        # Create unpacking mask to separate packed logits out into sequence-specific logits only
+        unpacking_mask = attention_mask[:, None, :].repeat(1, self.max_sequences_per_pack, 1)
+        pack_seq_ids = torch.arange(1, self.max_sequences_per_pack + 1).view(self.max_sequences_per_pack, 1)  # column of slot ids 1..max_sequences_per_pack
+
+        unpacking_mask = unpacking_mask == pack_seq_ids  # row k is True where the token's mask value equals slot id k + 1
+
+        # Expand start/end logits with the mask: positions outside a slot's sequence are multiplied by False, i.e. become 0 (not -inf)
+        unpacked_start_logits = final_layer_output.start_logits[:, None, :] * unpacking_mask
+        unpacked_end_logits = final_layer_output.end_logits[:, None, :] * unpacking_mask
+
+        # Calculate loss on logits/labels with initial [bs, mspp, ...] dims collapsed into one [bs*mspp, ...]
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            start_positions = start_positions.view(-1)
+            end_positions = end_positions.view(-1)
+
+            unpacked_start_logits = unpacked_start_logits.contiguous()
+            unpacked_end_logits = unpacked_end_logits.contiguous()
+
+            unpacked_start_logits = unpacked_start_logits.view(-1, unpacked_start_logits.shape[-1])
+            unpacked_end_logits = unpacked_end_logits.view(-1, unpacked_end_logits.shape[-1])
+
+            loss_fct = nn.CrossEntropyLoss()
+            start_loss = loss_fct(unpacked_start_logits, start_positions)
+            end_loss = loss_fct(unpacked_end_logits, end_positions)
+
+            total_loss = (start_loss + end_loss) / 2
+
+        return self.output(  # logits are flattened to 2D only when both position tensors were given
+            loss=total_loss,
+            start_logits=unpacked_start_logits,
+            end_logits=unpacked_end_logits,
+            hidden_states=final_layer_output.hidden_states,
+            attentions=final_layer_output.attentions,
+        )
+
+
+class PipelinedPackedBertForQuestionAnswering(BertForQuestionAnswering, BertPipelineMixin):
+    """
+    This class extends BertForQuestionAnswering with some differences required for packing. The 'packed' attention mask is extended to a 3D binary "extended" attention mask for BERT to recognise the sequences within a single packed input as unrelated sequences. The parent forward pass is run without labels; PackedBertOutputsForQA then 'unpacks' the packed logits per sequence slot before performing the loss calculation. Returns a tuple: (loss, start_logits, end_logits) when both position tensors are given, otherwise (start_logits, end_logits).
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.max_seq_per_pack = self.config.max_sequences_per_pack  # not read again by the methods of this class
+        self.packed_outputs = PackedBertOutputsForQA(config)
+
+    def parallelize(self):  # returns self so calls can be chained
+        super().parallelize()
+        last_ipu = self.ipu_config.ipus_per_replica - 1
+        self.qa_outputs = poptorch.BeginBlock(self.qa_outputs, "QA Outputs", ipu_id=last_ipu)  # QA head starts a block on the last IPU
+        return self
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,  # required in practice: its shape is read unconditionally below
+        attention_mask: Optional[torch.Tensor] = None,  # required in practice: indexed unconditionally below
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,  # not forwarded to super().forward()
+        inputs_embeds: Optional[torch.Tensor] = None,  # not forwarded to super().forward()
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,  # not forwarded to super().forward()
+        output_hidden_states: Optional[bool] = None,  # not forwarded to super().forward()
+        return_dict: Optional[bool] = None,  # not forwarded to super().forward()
+    ) -> Tuple[torch.Tensor]:
+        # Create 3D attention mask for sequence specific attention in pack: set where both positions share the same non-zero mask value
+        seq_len = input_ids.shape[1]
+        packed_attention_mask = attention_mask[:, None, :].repeat(1, seq_len, 1)
+        packed_attention_mask = (packed_attention_mask == packed_attention_mask.transpose(1, 2)) * (
+            packed_attention_mask != 0
+        )
+
+        # Run forwards pass through model without labels, so the parent class computes no loss here
+        final_layer_output = super().forward(
+            input_ids, attention_mask=packed_attention_mask, token_type_ids=token_type_ids, position_ids=position_ids
+        )
+
+        # Custom PackedBert for SQuAD output: unpacking and loss use the original packed mask, not the 3D one
+        output = self.packed_outputs(
+            final_layer_output,
+            attention_mask=attention_mask,
+            start_positions=start_positions,
+            end_positions=end_positions,
+        )
+
+        if start_positions is not None and end_positions is not None:  # positions given: (loss, start_logits, end_logits)
+            return poptorch.identity_loss(output.loss, reduction="mean"), output.start_logits, output.end_logits
+        else:  # no positions: (start_logits, end_logits)
+            return output.start_logits, output.end_logits
diff --git a/packed-bert/packedBERT_multi_label_text_classification.ipynb b/packed-bert/packedBERT_multi_label_text_classification.ipynb
new file mode 100644
index 0000000..70f611b
--- /dev/null
+++ b/packed-bert/packedBERT_multi_label_text_classification.ipynb
@@ -0,0 +1,1293 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "X4cRE8IbIrIV"
+   },
+   "source": [
+    "First of all, make sure your environment has installed the latest version of [🤗 Optimum Graphcore](https://github.com/huggingface/optimum-graphcore)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 1000
+    },
+    "id": "MOsHUjgdIrIW",
+    "outputId": "f84a093e-147f-470e-aad9-80fb51193c8e"
+   },
+   "outputs": [],
+   "source": [
+    "%pip install git+https://github.com/huggingface/optimum-graphcore.git"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Also make sure all the packages required for this notebook are installed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install scikit-learn;\n",
+    "%pip install datasets\n",
+    "%pip install evaluate\n",
+    "%pip install tokenizers\n",
+    "%pip install matplotlib\n",
+    "%pip install scipy\n",
+    "%pip install --force-reinstall huggingface_hub==0.11.1;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's start by importing and printing out the versions of `Transformers` and `Optimum Graphcore`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import transformers\n",
+    "import optimum.graphcore\n",
+    "\n",
+    "print(transformers.__version__)\n",
+    "print(optimum.graphcore.__version__)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "At the end of this notebook, to be able to share your model with the community and easily access it through HuggingFace, there are some short set-up steps you must follow to enable uploading your checkpoint to the HuggingFace Hub.\n",
+    "\n",
+    "First, store your authentication token from the Hugging Face website ([sign up here](https://huggingface.co/join) if you haven't already!), then execute the following cell and enter the token when prompted:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Git-lfs must also be installed to enable large file storage when pushing to the hub:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!apt install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rEJBSTyZIrIb"
+   },
+   "source": [
+    "# Faster multi-label text classification with PackedBERT"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook builds on the process of fine-tuning BERT on a [text classification task](text_classification.ipynb) showing how to implement packing for BERT for multi-label classification. [Packing](https://www.graphcore.ai/posts/introducing-packed-bert-for-2x-faster-training-in-natural-language-processing) is an optimisation method originally used for 2x faster BERT pre-training, which can now also provide massive throughput increases for **fine-tuning** and **batched inference**! \n",
+    "\n",
+    "**So, what *is* packing?** The basic idea of 'packing' a dataset is to utilise the requirement for constant-shaped inputs into a model. Instead of padding it with empty, unused space, we can recycle this unused space and fill it with more inputs! The architecture of transformer models like BERT supports this, and lets us optimally use this space to process multiple sequences within one input.\n",
+    "\n",
+    "**And here is why you might want to use it:** Having a single input contain multiple sequences leads to multiple sequences being processed in parallel in a single pass within a single iteration inside a batch, increasing the 'effective' batch size of the model by a considerable factor in many cases, and most importantly, increasing model throughput for training and batched inference significantly.\n",
+    "\n",
+    "The [GoEmotions](https://ai.googleblog.com/2021/10/goemotions-dataset-for-fine-grained.html) dataset will be fine-tuned using packing. This notebook outlines how to easily enable packing for BERT when performing fine-tuning/inference on a text-classification task in 🤗 Optimum, resulting in an impressive 5-9x faster training and inference run-time for the dataset. \n",
+    "\n",
+    "You can read more about packing in the original [paper](https://arxiv.org/abs/2107.02027)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![GoEmotions dataset (Source: GoogleBlog)](images/go_emotions.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The dataset consists of 58k comments labelled for 27 different emotion categories (and a 28th \"neutral\" category). This dataset is used for multi-label, multi-class classification. The dataset format and categories can be viewed on the [Hugging Face Hub](https://huggingface.co/datasets/go_emotions)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's initialise our training configurations. \n",
+    "\n",
+    "In this notebook, we are using both data parallelism and pipeline parallelism (see this [tutorial](https://github.com/graphcore/tutorials/blob/master/tutorials/pytorch/efficient_data_loading/walkthrough.ipynb) for more). Therefore the global batch size, which is the actual number of samples used for the weight update, is determined using four factors:\n",
+    "\n",
+    "    global batch size = micro_batch_size * gradient accumulation steps * device iterations * replication factor\n",
+    "\n",
+    "and replication factor is determined by pod_type, which will be used as a key to select the replication factor from a dictionary defined in the IPU config file. For example, the dictionary in the IPU config file Graphcore/roberta-base-ipu looks like this:\n",
+    "\n",
+    "    \"replication_factor\": {\"pod4\": 1, \"pod8\": 2, \"pod16\": 4, \"pod32\": 8, \"pod64\": 16, \"default\": 1}\n",
+    "\n",
+    "Depending on your model and the pod machine you are using, you might need to adjust these three batch-size-related arguments.\n",
+    "\n",
+    "By default this notebook is configured to run on 4 IPUs.\n",
+    "\n",
+    "Finally, `max_seq_length` is the maximum length a sequence can be, and all sequences will be padded to this length, so it should not be larger than the maximum length of the model. \n",
+    "\n",
+    "Given the small size of the sequences in go-emotions, we can reduce the model maximum input size to `max_seq_length = 256`. Set these parameters and the rest of the notebook should run smoothly:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zVvslsfMIrIh",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model_checkpoint = \"bert-base-uncased\" # Default uncased pre-trained BERT checkpoint\n",
+    "ipu_config_name = \"Graphcore/bert-base-uncased\" # Default Graphcore IPU config initialisation for pre-trained BERT\n",
+    "max_seq_length = 256 # The maximum sequence length allowed for sequences in the model.\n",
+    "micro_batch_size = 2 \n",
+    "gradient_accumulation_steps = 39\n",
+    "device_iterations = 32\n",
+    "model_task = 'go_emotions'\n",
+    "num_labels = 28"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Gradients are not calculated during validation, so gradient accumulation is not applicable, and the global batch size for validation can be defined separately as:\n",
+    "```\n",
+    "global_validation_batch_size=device_iterations*replication_factor*max_seq_per_pack\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Values for machine size and cache directories can be configured through environment variables or directly in the notebook:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "pod_type = os.getenv(\"GRAPHCORE_POD_TYPE\", \"pod4\")\n",
+    "executable_cache_dir = os.path.join(os.getenv(\"POPLAR_EXECUTABLE_CACHE_DIR\", \"/tmp/\"), \"packed_bert_mlseqcls_exe_cache/\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "whPRbBNbIrIl"
+   },
+   "source": [
+    "## Loading the dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "W7QYTpxXIrIl"
+   },
+   "source": [
+    "We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download the data and the [🤗 Evaluate](https://github.com/huggingface/evaluate) library to get the metric we need to use for evaluation (to compare our model to the benchmark). This can be easily done with the functions `load_dataset` and `evaluate.load`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IreSlFmlIrIm",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "import evaluate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "dataset = load_dataset(model_task, cache_dir=os.path.join(os.getenv(\"POPLAR_EXECUTABLE_CACHE_DIR\", \"/tmp/\"), \"packed_bert_mlseqcls_dataset_cache/\"))\n",
+    "metric = evaluate.load(\"roc_auc\", \"multilabel\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RzfPtOMoIrIu"
+   },
+   "source": [
+    "The `dataset` object itself is [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key for the training, validation and test set (with more keys for the mismatched validation and test set in the special case of `mnli`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "GWiVUF0jIrIv",
+    "outputId": "35e3ea43-f397-4a54-c90c-f2cf8d36873e",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "u3EtYfeHIrIz"
+   },
+   "source": [
+    "To access an actual element, you need to select a split first, then give an index:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "X6HrpprwIrIz",
+    "outputId": "d7670bc0-42e4-4c09-8a6a-5c018ded7d95",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "dataset[\"train\"][0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "WHUmphG3IrI3"
+   },
+   "source": [
+    "To get a sense of what the data looks like, the following function will show some examples picked randomly in the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "i3j8APAoIrI3",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "import random\n",
+    "import pandas as pd\n",
+    "from IPython.display import display, HTML\n",
+    "\n",
+    "def show_random_elements(dataset, num_examples=10):\n",
+    "    assert num_examples <= len(dataset), \"Can't pick more elements than there are in the dataset.\"\n",
+    "    picks = []\n",
+    "    for _ in range(num_examples):\n",
+    "        pick = random.randint(0, len(dataset)-1)\n",
+    "        while pick in picks:\n",
+    "            pick = random.randint(0, len(dataset)-1)\n",
+    "        picks.append(pick)\n",
+    "    \n",
+    "    df = pd.DataFrame(dataset[picks])\n",
+    "    for column, typ in dataset.features.items():\n",
+    "        if isinstance(typ, datasets.ClassLabel):\n",
+    "            df[column] = df[column].transform(lambda i: typ.names[i])\n",
+    "    display(HTML(df.to_html()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "SZy5tRB_IrI7",
+    "outputId": "ba8f2124-e485-488f-8c0c-254f34f24f13",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "show_random_elements(dataset[\"train\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "lnjDIuQ3IrI-"
+   },
+   "source": [
+    "The metric is an instance of [`evaluate.EvaluationModule`](https://huggingface.co/docs/evaluate/package_reference/main_classes#evaluate.EvaluationModule):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "5o4rUteaIrI_",
+    "outputId": "18038ef5-554c-45c5-e00a-133b02ec10f1",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "metric"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "jAWdqcUBIrJC"
+   },
+   "source": [
+    "You can call its `compute` method with your predictions and labels directly and it will return a dictionary with the metric(s) value:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "n9qywopnIrJH"
+   },
+   "source": [
+    "## Preprocessing the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "YVx71GdAIrJH"
+   },
+   "source": [
+    "Before we can feed the texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put them into a format the model expects, as well as generate the other inputs that the model requires.\n",
+    "\n",
+    "To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:\n",
+    "\n",
+    "- we get a tokenizer that corresponds to the model architecture we want to use,\n",
+    "- we download the vocabulary used when pretraining this specific checkpoint.\n",
+    "\n",
+    "That vocabulary will be cached, so it's not downloaded again the next time we run the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "eXNLu_-nIrJI",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "    \n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Vl6IidfdIrJK"
+   },
+   "source": [
+    "We pass along `use_fast=True` to the call above to use one of the fast tokenizers (backed by Rust) from the 🤗 Tokenizers library. Those fast tokenizers are available for almost all models, but if you got an error with the previous call, remove that argument."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rowT4iCLIrJK"
+   },
+   "source": [
+    "You can directly call this tokenizer on one sentence or a pair of sentences:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "a5hBlsrHIrJL",
+    "outputId": "acdaa98a-a8cd-4a20-89b8-cc26437bbe90",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "tokenizer(\"Hello, this one sentence!\", \"And this sentence goes with it.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "qo_0B1M2IrJM"
+   },
+   "source": [
+    "Depending on the model you selected, you will see different keys in the dictionary returned by the cell above. They don't matter much for what we're doing here (just know they are required by the model we will instantiate later), you can learn more about them in [this tutorial](https://huggingface.co/transformers/preprocessing.html) if you're interested.\n",
+    "\n",
+    "To preprocess our dataset, we will need the names of the columns containing the sentence(s). In this case, the column is called `'text'` and it is indexed as such in the tokenization function."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2C0hcmp9IrJQ"
+   },
+   "source": [
+    "We can then write the function that will preprocess our samples. We just feed them to the `tokenizer` with two extra arguments. `truncation=True` will ensure that an input longer than the maximum length is truncated to the maximum length. `max_length=max_seq_length` sets the maximum length of a sequence.\n",
+    "\n",
+    "**Note that since we use packing later, we don't set any padding in the tokenizer.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vc0BSBLIIrJQ",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# no padding for packing\n",
+    "def preprocess_function(examples):\n",
+    "    return tokenizer(examples['text'], truncation=True, max_length=max_seq_length)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For multi-label classification, we also need to convert our labels from integer values indicating a category to an N-hot binary format (where N is the number of label categories). This makes sure we have constant sized labels, and all of our labels (one input can have multiple target labels) are present for training. The conversion looks something like this:\n",
+    "\n",
+    "```python\n",
+    "unprocessed_labels = [3,21] # Where 3 and 21 are label categories\n",
+    "preprocessed_labels = id_to_N_hot([3,21])\n",
+    "preprocessed_labels = [0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0]\n",
+    "```\n",
+    "\n",
+    "\n",
+    "The following function processes one example and converts it to N-hot - the `.map()` functionality available in the `datasets` library allows the function to be applied easily to the entire dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "def id_to_N_hot(example):\n",
+    "    indexes = example['labels']\n",
+    "    label = np.zeros((num_labels,), dtype=int)\n",
+    "    for idx in indexes:\n",
+    "        label[idx] = 1\n",
+    "    example['labels'] = label\n",
+    "    return example"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "zS-6iXTkIrJT"
+   },
+   "source": [
+    "To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the `map` method of our `dataset` object we created earlier. This will apply the function on all the elements of all the splits in `dataset`, so our training, validation and testing data will be preprocessed in one single command."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DDtsaJeVIrJT",
+    "outputId": "aa4734bf-4ef5-4437-9948-2c16363da719",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "encoded_dataset = dataset.map(id_to_N_hot)\n",
+    "encoded_dataset = encoded_dataset.map(preprocess_function, batched=True)\n",
+    "\n",
+    "len(encoded_dataset['validation'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "voWiw8C7IrJV"
+   },
+   "source": [
+    "Even better, the results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to map has changed (and thus that the cached data must not be used). For instance, it will properly detect if you change the task in the first cell and rerun the notebook. 🤗 Datasets warns you when it uses cached files; you can pass `load_from_cache_file=False` in the call to `map` to not use the cached files and force the preprocessing to be applied again.\n",
+    "\n",
+    "Note that we passed `batched=True` to encode the texts by batches together. This is to leverage the full benefit of the fast tokenizer we loaded earlier, which will use multi-threading to treat the texts in a batch concurrently."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##  Packing the dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To implement packing, we need to pack our dataset first. Each new element will be a \"pack\" containing at most `max_seq_per_pack` sequences."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "max_seq_per_pack = 6"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The problem type for this task is `multi_label_classification`. This also needs to be defined for the packed model to work."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "problem_type = 'multi_label_classification'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Packing algorithm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In order to pack efficiently, we will use a histogram-based algorithm: shortest-pack-first histogram packing (SPFHP) presented in the [blog post](https://www.graphcore.ai/posts/introducing-packed-bert-for-2x-faster-training-in-natural-language-processing) adapted from the [blog code](https://github.com/graphcore/tutorials/tree/master/blogs_code/packedBERT). The full process of packing the dataset consists of four steps:\n",
+    "\n",
+    "1. Create a histogram of the sequence lengths of the dataset.\n",
+    "2. Generate the 'strategy' for the dataset using one of the state-of-the-art packing algorithms, which maps out the order and indices of the sequences that need to be packed together.\n",
+    "3. Use this strategy to create the actual dataset, concatenating the tokenized features together for each column in the dataset, including the labels.\n",
+    "4. Finally, pass these new columns into a custom PyTorch dataset, ready to be passed to the PopTorch dataloader!\n",
+    "\n",
+    "These steps have been simplified through the easy-to-use `packing_utils` available in Graphcore Optimum. You can simply generate the packed dataset after the usual tokenization and preprocessing by passing all necessary packing configuration to the `PackedDatasetCreator` class, and generate the ready-to-use PyTorch dataset with `.create()`.\n",
+    "\n",
+    "Within the function, there are some column names used by default. The expected default columns for text classification include:\n",
+    "* `input_ids`\n",
+    "* `attention_mask`\n",
+    "* `token_type_ids`\n",
+    "* `labels`\n",
+    "\n",
+    "These should all be generated automatically when tokenizing any classification dataset for BERT. However, the labels key, as it is not encoded, may have a different name. For this dataset, the column key for the labels is `labels`; we pass this to the `custom_label_key` argument so the class can find our labels.\n",
+    "\n",
+    "The `PackedDatasetCreator` requires different instantiations for different datasets, so it must be called separately for each of our dataset splits. We can set either `training`, `validation` or `inference` to `True` as needed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from utils.packing.dataset_creator import PackedDatasetCreator\n",
+    "\n",
+    "train_data_packer = PackedDatasetCreator(\n",
+    "    tokenized_dataset = encoded_dataset['train'],\n",
+    "    max_sequence_length = max_seq_length,\n",
+    "    max_sequences_per_pack = max_seq_per_pack,\n",
+    "    training = True,\n",
+    "    num_labels = num_labels,\n",
+    "    problem_type = problem_type,\n",
+    "    algorithm = 'SPFHP',\n",
+    "    custom_label_key = 'labels'\n",
+    ")\n",
+    "\n",
+    "val_data_packer = PackedDatasetCreator(\n",
+    "    tokenized_dataset = encoded_dataset['validation'],\n",
+    "    max_sequence_length = max_seq_length,\n",
+    "    max_sequences_per_pack = max_seq_per_pack,\n",
+    "    validation = True,\n",
+    "    num_labels = num_labels,\n",
+    "    problem_type = problem_type,\n",
+    "    algorithm = 'SPFHP',\n",
+    "    custom_label_key = 'labels'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This will create the strategy and initialise the necessary parameters for packing the dataset. We can see that the ideal speed-up we have achieved is approximately 5.7x the original dataset, which corresponds directly to the average packing factor: the average number of sequences within one pack.\n",
+    "\n",
+    "The `PackedDatasetCreator` class also has some other features we do not use here for training, such as `pad_to_global_batch_size`. This feature is useful for performing batched inference on a large number of samples when we do not want to lose any of the samples when creating data iterators using the `poptorch.DataLoader`: it applies 'vertical' padding to the dataset, adding filler rows to bring the dataset up to a value divisible by the global batch size, and allows for the largest possible batch sizes to be used without any loss of data.\n",
+    "\n",
+    "You can also view the histogram generated in the first step of the packing process, to observe whether the distribution of sequence lengths in the dataset will benefit from packing - as a general rule, as long as the average length of the sequences in the dataset is 50% or less of the maximum sequence length, packing will offer at least a 2x throughput benefit, in other words: `throughput_increase ≈ max_seq_len/mean_seq_len`\n",
+    "\n",
+    "Many datasets have distributions with much smaller average lengths, and will benefit much more. We can easily observe this distribution by retrieving and plotting the histogram from the data class:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "train_histogram = train_data_packer.histogram\n",
+    "\n",
+    "plt.hist(train_histogram, bins = [k for k in range(0,max_seq_length,10)]) \n",
+    "plt.title(\"Sequence length histogram\") \n",
+    "plt.xlabel('Sequence lengths')\n",
+    "plt.ylabel('Frequency')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we need to create the actual packed dataset, this is the 3rd step of the packing process outlined above.\n",
+    "\n",
+    "In this stage, we take the strategy for mapping the sequences by size into 'packs' that was generated by the packing algorithm, and use this to extract the sequences from the tokenized dataset, inserting them into packs for each column in the dataset. Any remaining space in a pack after the sequences have been concatenated is padded to bring all sequences up to the maximum sequence length.\n",
+    "\n",
+    "Some key features unique to packed datasets are worth mentioning here:\n",
+    "\n",
+    "- A specific `attention_mask` is generated: It contains a unique index for each sequence of the pack and `0` for the remaining padding tokens. This, essentially, tells the model where to \"look\" from the perspective of a single token, ignoring any encoded information (such as a different sequence) that is not relevant to that token.\n",
+    "    - Example of 3 sequences: `attention_mask = [1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,0,...,0,1,2,3]`\n",
+    "\n",
+    "\n",
+    "- The [CLS] tokens of each sequence must be moved to the end of the pack.\n",
+    "    - For instance: `[CLS,a,b,c] + [CLS, d,e,f] + [CLS, g,h,i] -> [a,b,c,d,e,f,g,h,i,...,CLS,CLS,CLS]`\n",
+    "    \n",
+    "\n",
+    "- The `position_ids` of a pack contain the concatenated `position_ids` of each sequence.\n",
+    "    - For instance given 3 sequences: `[0,1,2,3,4] + [0,1,2,3] + [0,1,2] -> [1,2,3,4,1,2,3,1,2,...,0,0,0]` (note: the position IDs '0' of the CLS tokens are also moved to the end of the pack)\n",
+    "    \n",
+    "- `labels` and `token_type_ids` are also packed to correspond to the `input_ids` pack.\n",
+    "\n",
+    "\n",
+    "To create a dataloader-ready packed dataset, all you need to do is call the `create()` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "packed_train_dataset = train_data_packer.create()\n",
+    "packed_val_dataset = val_data_packer.create()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's visualize one sample of the new `packed_train_dataset`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "packed_train_dataset[133]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fine-tuning the model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that our data is ready, we can download the pretrained model and fine-tune it."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Implementing Packed BERT\n",
+    "\n",
+    "A few model modifications are required to make packing work with BERT.\n",
+    "We extend the existing class `BertForSequenceClassification` to `PipelinedPackedBertForSequenceClassification` which incorporates the required changes to the pooler and the model output. The crux of these changes is to modify the generic sequence classification model to handle 'unpacking' multiple sequences in the output stage, treating them as a larger batch size for classification, as well as masking any padding created by packing.\n",
+    "\n",
+    "First let's load a default BERT configuration using `AutoConfig`. The config includes a new parameter we must set, `max_sequences_per_pack`, this informs the model of the maximum number of sequences it will need to 'unpack' in the model output. It also allows us to clearly define the `num_labels` and `problem_type` for this model.\n",
+    "\n",
+    "The problem type is essential to define here, as switching between methods used by different types of classification requires it within the custom model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoConfig\n",
+    "\n",
+    "config = AutoConfig.from_pretrained(model_checkpoint)\n",
+    "config.max_sequences_per_pack = max_seq_per_pack\n",
+    "config.num_labels = num_labels\n",
+    "config.problem_type = problem_type"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can instantiate the model class with the config, loading the weights from the model checkpoint."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import numpy as np\n",
+    "torch.manual_seed(43)\n",
+    "np.random.seed(43)\n",
+    "\n",
+    "from models.modeling_bert_packed import PipelinedPackedBertForSequenceClassification\n",
+    "\n",
+    "\n",
+    "# `from_pretrained` is a classmethod: call it on the class directly so that we do not\n",
+    "# first build a throwaway, randomly-initialised copy of the model.\n",
+    "model = PipelinedPackedBertForSequenceClassification.from_pretrained(\n",
+    "    model_checkpoint, config=config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The warning is telling us we are throwing away some weights and randomly initializing others. This is absolutely normal in this case, because we are removing the head used to pretrain the model on a masked language modeling objective and replacing it with a new head for sequence classification, for which we don't have pretrained weights. The library therefore warns us that we should fine-tune this model before using it for inference, which is exactly what we are going to do.\n",
+    "\n",
+    "We can first test the model on CPU and observe that the output logits have the size `[batch_size, max_seq_per_pack, num_labels] = [1, 6, 28]` with this notebook's default values, and the 28 labels for the dataset. The logits are reshaped into this form in the model output, to be the same shape as the labels, for ease of postprocessing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# test the model on CPU\n",
+    "from transformers.data.data_collator import default_data_collator\n",
+    "\n",
+    "loader = torch.utils.data.DataLoader(packed_train_dataset,\n",
+    "                             batch_size=1,\n",
+    "                             shuffle=True,\n",
+    "                             drop_last=True,\n",
+    "                             collate_fn=default_data_collator)\n",
+    "data = next(iter(loader))\n",
+    "labels = data['labels']\n",
+    "\n",
+    "print('labels: ', labels.shape)\n",
+    "o = model(**data)\n",
+    "print('outputs (loss, logits): ', o[0], o[1].shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, let's prepare the model for IPU"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, we set the model in half precision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model.half()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For validation, we need to define a function to compute the metrics from the predictions, which will just use the `metric` we loaded earlier. Preprocessing here involves a step that uses a boolean mask to remove the labels and predictions we are not using (these were set to a `-100` value when creating the dataset). Then, the predictions are passed into a softmax function to determine the probabilities of each class, as this is a multi-label task.\n",
+    "\n",
+    "These predictions and labels are passed into the metric function to compute the ROC AUC score during evaluation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model_name = model_checkpoint.split(\"/\")[-1]\n",
+    "from scipy.special import softmax\n",
+    "from tqdm import tqdm\n",
+    "def compute_metrics(eval_pred):\n",
+    "    predictions, labels = eval_pred\n",
+    "    \n",
+    "    labels = labels.reshape(-1, labels.shape[-1])\n",
+    "    predictions = predictions.reshape(-1, predictions.shape[-1])\n",
+    "    \n",
+    "    # Remove the padding labels\n",
+    "    mask = (labels != -100)[:,0]\n",
+    "    \n",
+    "    labels = labels[mask,:]\n",
+    "    predictions = predictions[mask,:]\n",
+    "    pred_scores = softmax(predictions.astype(\"float32\"), axis=1)    \n",
+    "\n",
+    "    auc = metric.compute(\n",
+    "        prediction_scores=pred_scores, references=labels, multi_class=\"ovr\"\n",
+    "    )[\"roc_auc\"]\n",
+    "\n",
+    "    return {\"roc_auc\": auc}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we need to define the `IPUConfig`, which is a class that specifies attributes and configuration parameters to compile and put the model on the device. We initialize it with one config name or path, which we set earlier. Then we use it to set the model attribute `model.ipu_config`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from optimum.graphcore import IPUConfig, IPUTrainer, IPUTrainingArguments\n",
+    "\n",
+    "ipu_config = IPUConfig.from_pretrained(\n",
+    "    ipu_config_name,\n",
+    "    executable_cache_dir = executable_cache_dir,\n",
+    "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
+    "    device_iterations = device_iterations,\n",
+    "    replication_factor=1,\n",
+    "    inference_device_iterations = 64,\n",
+    "    inference_replication_factor = 1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The IPUTrainingArguments define any custom parameter modification we want to do, such as the initial learning rate for the model. It also allows other options, such as dataloader parameters, micro batch sizes and an automatic push to the Huggingface Hub (if credentials were set up earlier) to happen at given intervals.\n",
+    "\n",
+    "These arguments are passed to the `IPUTrainer` which wraps the model training and evaluation process into a simple single-line process, doing all of the heavy lifting for us regarding training and evaluation loops, device assignment, optimiser definition, dataloading etc.\n",
+    "\n",
+    "Note that only some arbitrary hyperparameter tuning was performed for this task. Other tasks and datasets may require further tuning to get the most optimal results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import default_data_collator\n",
+    "metric_name = \"roc_auc\"\n",
+    "\n",
+    "args = IPUTrainingArguments(\n",
+    "    \"./\"+f\"{model_name}-{model_task}\",\n",
+    "    per_device_train_batch_size=micro_batch_size,\n",
+    "    per_device_eval_batch_size=4,\n",
+    "    num_train_epochs=5,\n",
+    "    learning_rate=2e-4,\n",
+    "    adam_epsilon=1e-6,\n",
+    "    loss_scaling=16.0,\n",
+    "    warmup_ratio=0.1,\n",
+    "    weight_decay=0,\n",
+    "    lr_scheduler_type = \"cosine\",\n",
+    "    metric_for_best_model=metric_name,\n",
+    "    dataloader_drop_last=True,\n",
+    "    # dataloader_mode=\"async_rebatched\",\n",
+    "    logging_steps=1,\n",
+    "    pod_type=pod_type,\n",
+    "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
+    "    push_to_hub=True    \n",
+    ")\n",
+    "\n",
+    "trainer = IPUTrainer(\n",
+    "    model,\n",
+    "    ipu_config,\n",
+    "    args,\n",
+    "    train_dataset=packed_train_dataset,\n",
+    "    eval_dataset=packed_val_dataset,\n",
+    "    data_collator=default_data_collator,\n",
+    "    compute_metrics=compute_metrics\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then, to train the model we can simply call the `train()` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***About the performance:*** `IPUTrainer` doesn't take into account that we have packed data samples when computing the speed metrics. It treats a 'sample' as a single input to the model, i.e. one **pack**.\n",
+    "\n",
+    "So the actual throughput estimation can be obtained by multiplying the `samples_per_second` by the average packing factor (the average number of samples per pack) of the dataset. These were obtained in the `packing_algorithm` section: `5.68` for the `go-emotions` training set and `5.83` for validation set.\n",
+    "\n",
+    "\n",
+    "Next, we can evaluate the model by simply calling the `evaluate()` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "trainer.evaluate()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can now upload the result of the training to the Hub if you successfully logged in at the beginning of this notebook, just execute this instruction:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# trainer.push_to_hub()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can also save the model locally:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "# `Path` objects are joined with `/` (using `+` raises a TypeError); convert back to `str`\n",
+    "# so the checkpoint location can be passed to APIs that expect a plain string path.\n",
+    "saved_model_checkpoint = str(Path(os.getenv('CHECKPOINT_DIR', '/tmp/')) / f\"{model_name}-{model_task}\")\n",
+    "\n",
+    "trainer.save_model(saved_model_checkpoint)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You have now successfully fine-tuned and evaluated your speed-optimised model for text classification using packing!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fast batched inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "Packing can also be used for inference, particularly for batched inference on large workloads. This section demonstrates how to perform faster, batched inference with a large number of samples using a super-easy custom pipeline which batches and packs your input data, performs inference and returns postprocessed predictions.\n",
+    "\n",
+    "For the pipeline, we need to import it, and initialise a few essential parameters.\n",
+    "\n",
+    "The `model` is the model checkpoint, we are going to use the locally saved checkpoint generated from training `go_emotions`. The `executable_cache_dir`, `problem_type`, `max_seq_length` must be specified. To return predictions organised by class names, the class names for your output must be passed to `label_categories`.  If you are loading a saved model without a pre-trained tokenizer saved in the checkpoint folder, it will be loaded automatically from `bert-base-uncased`, if you wish to load a different pre-trained tokenizer, you can specify this by passing the `pretrained_tokenizer` argument with the name of your tokenizer to the `PackedBertTextClassificationPipeline`.\n",
+    "\n",
+    "The pipeline will automatically determine your model's IPU config, given that the checkpoint was trained using Optimum Graphcore, which will be the case for the model fine-tuned in this notebook.\n",
+    "\n",
+    "In this example, we pre-load the IPUConfig and modify some of the default parameters to get the best performance out of inference and leverage the benefits of IPU parallelism. The micro-batch size can also be specified, for which the default is 1.\n",
+    "\n",
+    "When training, the packing factor affects the convergence the same way as a large increase in batch size would do. However, for inference, we are free to use a bigger packing factor to speed it up. Let's try it with `max_seq_per_pack = 12`.\n",
+    "\n",
+    "**Note:** Packing brings huge benefits for performing inference on large amounts of data. For small scale inference tasks, such as those better suited to sequential inference on a single un-batched input, the generic Optimum Graphcore `TextClassificationPipeline` may be preferred. This won't affect fine-tuning: the weights generated from fine-tuning using packing will work just the same!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's list the class names for the GoEmotions dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "class_names = [\n",
+    "    \"admiration\",\n",
+    "    \"amusement\",\n",
+    "    \"anger\",\n",
+    "    \"annoyance\",\n",
+    "    \"approval\",\n",
+    "    \"caring\",\n",
+    "    \"confusion\",\n",
+    "    \"curiosity\",\n",
+    "    \"desire\",\n",
+    "    \"disappointment\",\n",
+    "    \"disapproval\",\n",
+    "    \"disgust\",\n",
+    "    \"embarrassment\",\n",
+    "    \"excitement\",\n",
+    "    \"fear\",\n",
+    "    \"gratitude\",\n",
+    "    \"grief\",\n",
+    "    \"joy\",\n",
+    "    \"love\",\n",
+    "    \"nervousness\",\n",
+    "    \"optimism\",\n",
+    "    \"pride\",\n",
+    "    \"realization\",\n",
+    "    \"relief\",\n",
+    "    \"remorse\",\n",
+    "    \"sadness\",\n",
+    "    \"surprise\",\n",
+    "    \"neutral\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's initialise the `PackedBertTextClassificationPipeline`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from pipeline.packed_bert import PackedBertTextClassificationPipeline\n",
+    "\n",
+    "from optimum.graphcore import IPUConfig\n",
+    "\n",
+    "model = saved_model_checkpoint\n",
+    "# model = 'your_username/{model_name}-{model_task}' # uncomment this and use your username to load from Hugging Face Hub\n",
+    "\n",
+    "inference_boosted_ipu_config = IPUConfig.from_pretrained(model,                                                       \n",
+    "        inference_device_iterations=32,\n",
+    "        inference_replication_factor=4,\n",
+    "        ipus_per_replica=1,\n",
+    "        layers_per_ipu=[12]\n",
+    "    )\n",
+    "\n",
+    "pipeline = PackedBertTextClassificationPipeline(\n",
+    "    model = model,\n",
+    "    executable_cache_dir = executable_cache_dir,\n",
+    "    problem_type='multi_label_classification',\n",
+    "    max_seq_per_pack=12,\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    ipu_config=inference_boosted_ipu_config,\n",
+    "    micro_batch_size=8,\n",
+    "    label_categories=class_names\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The pipeline expects a **list of strings** directly passed to it. There is no need to tokenize, preprocess, pack or postprocess the data to use the inference pipeline.\n",
+    "\n",
+    "As a test, we can load the entire `go_emotions` dataset and perform packed inference using `.predict()` on the text column to generate predictions.\n",
+    "\n",
+    "For datasets with multiple sentences, these can simply be passed as `predict(<sentences_1>,<sentences_2>)`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "dataset = datasets.load_dataset('go_emotions','simplified')\n",
+    "preds = pipeline.predict(dataset['train']['text'])\n",
+    "\n",
+    "print(preds.keys())\n",
+    "print(f\"Number of predictions: {len(preds['predictions'])}\")\n",
+    "print(f\"Preprocessing time: {preds['preprocessing_time']}s\")\n",
+    "print(f\"Postprocessing time: {preds['postprocessing_time']}s\")\n",
+    "print(f\"Throughput: {preds['throughput']} samples/s\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There is minimal overhead from tokenizing and packing the dataset, but the speed benefits are evident. After increasing the maximum sequences to 12, we can observe a much higher packing factor of 9.14.\n",
+    "\n",
+    "Running the above pipeline, we achieve a throughput of approximately 45000 samples per second, demonstrating the huge time benefit you can achieve by using packing!"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "name": "Text Classification on GLUE",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/packed-bert/packedBERT_question_answering.ipynb b/packed-bert/packedBERT_question_answering.ipynb
new file mode 100644
index 0000000..db5e8b8
--- /dev/null
+++ b/packed-bert/packedBERT_question_answering.ipynb
@@ -0,0 +1,1149 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "51b0ba30",
+   "metadata": {},
+   "source": [
+    "First of all, ensure your environment has the latest version of  [🤗 Optimum Graphcore](https://github.com/huggingface/optimum-graphcore) installed:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ad38948",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install git+https://github.com/huggingface/optimum-graphcore.git"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b26df2b4",
+   "metadata": {},
+   "source": [
+    "Next, ensure all required packages for this notebook are installed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e98ec027",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%pip install datasets\n",
+    "%pip install evaluate\n",
+    "%pip install tokenizers\n",
+    "%pip install matplotlib\n",
+    "%pip install scipy\n",
+    "%pip install --force-reinstall huggingface_hub==0.11.1;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df689f96",
+   "metadata": {},
+   "source": [
+    "Let's start by importing the `transformers` and `optimum.graphcore` libraries, and printing the versions we are using."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa8d39f7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import transformers\n",
+    "import optimum.graphcore\n",
+    "\n",
+    "print(transformers.__version__)\n",
+    "print(optimum.graphcore.__version__)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "802f03f9",
+   "metadata": {},
+   "source": [
+    "At the end of this notebook, to be able to share your model with the community and easily access it through HuggingFace, there are some short set-up steps you must follow to enable uploading your checkpoint to the HuggingFace Hub.\n",
+    "\n",
+    "First, you need an authentication token from the Hugging Face website ([sign up here](https://huggingface.co/join) if you haven't already!); then execute the following cell and enter that token when prompted:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4c81945",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2b3d049e",
+   "metadata": {},
+   "source": [
+    "Git-lfs must also be installed to enable large file storage when pushing to the hub:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "afa6ac5a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "! apt install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "156ed6e4",
+   "metadata": {},
+   "source": [
+    "# Faster question-answering with SQuAD using PackedBERT\n",
+    "\n",
+    "This notebook describes how to fine-tune BERT from [🤗 Transformers](https://github.com/huggingface/transformers) for question-answering on the SQuAD (v1) dataset using [packing](https://towardsdatascience.com/introducing-packed-bert-for-2x-faster-training-in-natural-language-processing-eadb749962b1), an optimisation method originally used for 2x faster BERT pre-training, which can now also provide massive throughput increases for fine-tuning and batched inference! \n",
+    "\n",
+    "**So, what *is* packing?** The basic idea of 'packing' a dataset is to utilise the requirement for constant-shaped inputs into a model. Instead of padding it with empty, unused space, we can recycle this unused space and fill it with more inputs! The architecture of transformer models like BERT supports this, and lets us optimally use this space to process multiple sequences within one input.\n",
+    "\n",
+    "**And here is why you might want to use it:** Having a single input contain multiple sequences leads to multiple sequences being processed in parallel in a single pass within a single iteration inside a batch, increasing the 'effective' batch size of the model by a considerable factor in many cases, and most importantly, increasing model throughput for training and batched inference significantly.\n",
+    "\n",
+    "The process of training and validating the `BertForQuestionAnswering` model requires some adaptations to accommodate a packed dataset, and this notebook aims to introduce these on top of the [existing process](https://github.com/huggingface/optimum-graphcore/blob/main/notebooks/question_answering.ipynb) for fine-tuning the SQuAD dataset with BERT using an unmodified dataset."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "89898522",
+   "metadata": {},
+   "source": [
+    "Let's initialise our training configurations. \n",
+    "\n",
+    "Note here that we define a 'micro' batch size, which is the local batch size that would be passed into the model on the CPU. In this notebook, we are using both data parallelism and pipeline parallelism (see this [tutorial](https://github.com/graphcore/tutorials/blob/master/tutorials/pytorch/efficient_data_loading/walkthrough.ipynb)), so the 'global' batch size, i.e. the number of data elements passed for one gradient calculation on the IPU, is calculated using the `device_iterations`, `gradient_accumulation_steps`, `replication_factor` and `max_seq_per_pack` (maximum sequences in a pack) for training, such that:\n",
+    "\n",
+    "```\n",
+    "global_training_batch_size = micro_batch_size * device_iterations * gradient_accumulation_steps * replication_factor\n",
+    "```\n",
+    "\n",
+    "Depending on your model and the pod machine you are using, you might need to adjust these three batch-size-related arguments.\n",
+    "\n",
+    "`max_seq_per_pack` highlights the benefit of packing multiple sequences into one input sequence given there is enough space for them. It shows that multiple sequences are processed effectively in parallel within the model, using up space that would essentially be padding if one sequence were passed at a time. This is a much more efficient way to send inputs into the model, and improves the global batch size to a best-case-scenario of:\n",
+    "\n",
+    "```\n",
+    "global_training_batch_size = micro_batch_size * device_iterations * gradient_accumulation_steps * replication_factor * max_seq_per_pack\n",
+    "```\n",
+    "\n",
+    "Realistically, the global batch size will not always be multiplied by the *maximum* number of sequences in a packed sequence, but rather the *average* number of sequences in a packed sequence, and will depend on the sequence length distribution within any given dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ad1b478",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model_checkpoint=\"bert-base-uncased\" # Default uncased pre-trained BERT checkpoint\n",
+    "ipu_config_name=\"Graphcore/bert-base-uncased\" # Default Graphcore IPU config initialisation for pre-trained BERT\n",
+    "max_seq_length=512 # The maximum sequence length allowed for sequences in the model.\n",
+    "gradient_accumulation_steps=32 # Gradient accumulation steps for training the model on the IPU.\n",
+    "device_iterations = 32\n",
+    "micro_batch_size=2\n",
+    "model_task=\"squad\" "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "77dde875",
+   "metadata": {},
+   "source": [
+    "Gradients are not calculated during validation, so gradient accumulation is not applicable, and the global batch size for validation can be defined separately as:\n",
+    "\n",
+    "```\n",
+    "global_validation_batch_size=micro_batch_size*device_iterations*replication_factor*max_seq_per_pack\n",
+    "```\n",
+    "\n",
+    "In Optimum, we can define inference-specific `device iterations` and `replication factor`, which can be adjusted to create larger batches to compensate for the lack of a gradient accumulation factor."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "92ae1cca",
+   "metadata": {},
+   "source": [
+    "Values for machine size and cache directories can be configured through environment variables or directly in the notebook:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b882a5b3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "pod_type = os.getenv(\"GRAPHCORE_POD_TYPE\", \"pod4\")\n",
+    "executable_cache_dir = os.path.join(os.getenv(\"POPLAR_EXECUTABLE_CACHE_DIR\", \"/tmp/\"), \"packed_bert_qa_exe_cache/\")  # join copes with a cache dir given with or without a trailing slash"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33597c71",
+   "metadata": {},
+   "source": [
+    "## Loading the dataset\n",
+    "\n",
+    "The next step is to use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download the dataset from the hub, and to use the  [🤗 Evaluate](https://github.com/huggingface/evaluate) library to load the evaluation metrics for the SQuAD model. This will allow easy performance metric analysis during validation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b37cb293",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset, load_metric\n",
+    "import evaluate\n",
+    "\n",
+    "dataset = load_dataset(model_task, cache_dir=os.path.join(os.getenv(\"POPLAR_EXECUTABLE_CACHE_DIR\", \"/tmp/\"), \"packed_bert_qa_dataset_cache/\")) # Load dataset\n",
+    "metric = evaluate.load(model_task) # Load metric for dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6dac6eca",
+   "metadata": {},
+   "source": [
+    "The `dataset` object itself is a [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key per dataset split (for SQuAD, the training and validation sets):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2115928b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "23d3f421",
+   "metadata": {},
+   "source": [
+    "To access an actual element, you need to select a split first, then provide an index:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "311b8b73",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "dataset[\"train\"][0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3702f2a3",
+   "metadata": {},
+   "source": [
+    "In the SQuAD dataset, we have a `question`, its `context` i.e., an excerpt of text which includes the answer as well as surrounding context, and the `answers` key, which holds the start position of the answer in the context, as well as the answer itself. For a different or custom question-answering dataset, these fields may have different names but serve the same purpose, so pre-defining them is useful.\n",
+    "\n",
+    "We have a configuration describing these necessary keys in the dataset containing the raw data that needs to be pre-processed or tokenised before being passed into the model. These generic keys may change for custom datasets, but the usage of them generally stays the same for a similar fine-tuning task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "628bc41f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "question_key=\"question\"\n",
+    "context_key=\"context\"\n",
+    "answer_key=\"answers\"\n",
+    "train = True\n",
+    "validate = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "793dcd19",
+   "metadata": {},
+   "source": [
+    "## Preprocessing the data\n",
+    "\n",
+    "Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.\n",
+    "\n",
+    "To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:\n",
+    "\n",
+    "- we get a tokenizer that corresponds to the model architecture we want to use,\n",
+    "- we download the vocabulary used when pretraining this specific checkpoint.\n",
+    "\n",
+    "That vocabulary will be cached, so it's not downloaded again the next time we run the cell.\n",
+    "\n",
+    "The `Dataset` class is also imported, which will allow us to convert our modified and tokenized columns in dictionary form to a dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aab94819",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "from datasets import Dataset \n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a47ea927",
+   "metadata": {},
+   "source": [
+    "For SQuAD, we define a custom function to handle the overflows and offset mapping created by generating tokenised inputs from sequences, as well as the start and end positions of the answers which need to be translated from positions of characters to positions of tokens.\n",
+    "\n",
+    "The first step is to tokenize the dataset using the tokenizer. Note here that for packing, it is important to **not** pad the dataset, so `padding` should be set to `False`. If we pad, we will have to un-pad when packing sequences into a packed sequence, which is inefficient.\n",
+    "\n",
+    "The preprocessing function is described in more detail in [the original (unpacked) question-answering notebook](question_answering.ipynb). In this case, we can import the preprocessing directly from `utils.packing`, ready *without* padding for PackedBERT."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2263dfef",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from utils.packing.qa_utils import preprocess_packed_qa\n",
+    "\n",
+    "raw_train_dataset = dataset['train']\n",
+    "\n",
+    "tokenized_training_dataset = preprocess_packed_qa(\n",
+    "    dataset=raw_train_dataset,\n",
+    "    tokenizer=tokenizer,\n",
+    "    question_key=question_key,\n",
+    "    context_key=context_key,\n",
+    "    answer_key=answer_key,\n",
+    "    sequence_length=max_seq_length,\n",
+    "    padding=False,\n",
+    "    train=True\n",
+    ")\n",
+    "\n",
+    "\n",
+    "raw_validation_dataset = dataset['validation']\n",
+    "\n",
+    "tokenized_validation_dataset = preprocess_packed_qa(\n",
+    "    dataset=raw_validation_dataset,\n",
+    "    tokenizer=tokenizer,\n",
+    "    question_key=question_key,\n",
+    "    context_key=context_key,\n",
+    "    answer_key=answer_key,\n",
+    "    sequence_length=max_seq_length,\n",
+    "    padding=False,\n",
+    "    train=False\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f57906e8",
+   "metadata": {},
+   "source": [
+    "## Packing the dataset\n",
+    "\n",
+    "To implement packing, we need to pack our dataset first. Each new element will be a \"pack\" containing at most `max_seq_per_pack` sequences."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6bdd1b9e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "max_seq_per_pack = 6"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51c17c9b",
+   "metadata": {},
+   "source": [
+    "We also define the number of labels in our dataset. SQuAD is not a classification task, so here this means the number of outputs, i.e. positions returned by the model. It is set to 2, corresponding to the start and end positions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bfda406f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "num_labels = 2\n",
+    "problem_type = 'question_answering'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c39316ea",
+   "metadata": {},
+   "source": [
+    "### Packing algorithm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9d24ab5",
+   "metadata": {},
+   "source": [
+    "In order to pack efficiently, we will use a histogram-based algorithm: shortest-pack-first histogram packing (SPFHP) presented in the [blog post](https://www.graphcore.ai/posts/introducing-packed-bert-for-2x-faster-training-in-natural-language-processing) adapted from the [blog code](https://github.com/graphcore/tutorials/tree/master/blogs_code/packedBERT). The full process of packing the dataset consists of four steps:\n",
+    "\n",
+    "1. Create a histogram of the sequence lengths of the dataset.\n",
+    "2. Generate the 'strategy' for the dataset using one of the state-of-the-art packing algorithms, which maps out the order and indices of the sequences that need to be packed together.\n",
+    "3. Use this strategy to create the actual dataset, concatenating the tokenized features together for each column in the dataset, including the labels.\n",
+    "4. Finally, pass these new columns into a custom PyTorch dataset, ready to be passed to the PopTorch dataloader!\n",
+    "\n",
+    "These steps have been simplified through the easy-to-use `utils.packing` available in Graphcore Optimum. You can simply generate the packed dataset after the usual tokenization and preprocessing by passing all necessary packing configuration to the `PackedDatasetCreator` class, and generate the ready-to-use PyTorch dataset with `.create()`.\n",
+    "\n",
+    "Within the function, there are some column names used by default. The expected default columns for question-answering include:\n",
+    "* `input_ids`\n",
+    "* `attention_mask`\n",
+    "* `token_type_ids`\n",
+    "* `start_positions`\n",
+    "* `end_positions`\n",
+    "\n",
+    "These should all be generated automatically when tokenizing the SQuAD dataset for BERT.\n",
+    "\n",
+    "The `PackedDatasetCreator` requires different instantiations for different datasets, so it must be called separately for each of our dataset splits. We can set either `training`, `validation` or `inference` to `True` as needed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e66ed06d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from utils.packing.dataset_creator import PackedDatasetCreator\n",
+    "\n",
+    "train_data_packer = PackedDatasetCreator(\n",
+    "    tokenized_dataset = tokenized_training_dataset,\n",
+    "    max_sequence_length = max_seq_length,\n",
+    "    max_sequences_per_pack = max_seq_per_pack,\n",
+    "    training = True,\n",
+    "    num_labels = num_labels,\n",
+    "    problem_type = problem_type,\n",
+    "    algorithm = 'SPFHP'\n",
+    ")\n",
+    "\n",
+    "val_data_packer = PackedDatasetCreator(\n",
+    "    tokenized_dataset = tokenized_validation_dataset,\n",
+    "    max_sequence_length = max_seq_length,\n",
+    "    max_sequences_per_pack = max_seq_per_pack,\n",
+    "    validation = True,\n",
+    "    num_labels = num_labels,\n",
+    "    problem_type = problem_type,\n",
+    "    algorithm = 'SPFHP'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "720ea314",
+   "metadata": {},
+   "source": [
+    "This will create the strategy and initialise the necessary parameters for packing the dataset. We can see that the ideal speed-up we have achieved is approximately 2.2x the original dataset, which corresponds directly to the average packing factor: the average number of sequences within one pack.\n",
+    "\n",
+    "The `PackedDatasetCreator` class also has some other features we do not use here for training, such as `pad_to_global_batch_size`. This feature is useful for performing batched inference on a large number of samples when we do not want to lose any of the samples when creating data iterators using the `poptorch.DataLoader`: it applies 'vertical' padding to the dataset, adding filler rows to bring the dataset up to a value divisible by the global batch size, and allows for the largest possible batch sizes to be used without any loss of data."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "46319488",
+   "metadata": {},
+   "source": [
+    "You can also view the histogram generated in the first step of the packing process, to observe whether the distribution of sequence lengths in the dataset will benefit from packing - as a general rule, as long as the average length of the sequences in the dataset is 50% or less of the maximum sequence length, packing will offer at least a 2x throughput benefit, in other words: `throughput_increase ≈ max_seq_len/mean_seq_len`\n",
+    "\n",
+    "Many datasets have distributions with much smaller average lengths, and will benefit much more. We can easily observe this distribution by retrieving and plotting the histogram from the data class:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "113b58f4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "train_histogram = train_data_packer.histogram\n",
+    "\n",
+    "plt.hist(train_histogram, bins = [k for k in range(0,max_seq_length,10)]) \n",
+    "plt.title(\"Sequence length histogram\") \n",
+    "plt.xlabel('Sequence lengths')\n",
+    "plt.ylabel('Frequency')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1d077b97",
+   "metadata": {},
+   "source": [
+    "Now we need to create the actual packed dataset. This is the 3rd step of the packing process outlined above.\n",
+    "\n",
+    "In this stage, we take the strategy for mapping the sequences by size into 'packs' that was generated by the packing algorithm, and use this to extract the sequences from the tokenized dataset, inserting them into packs for each column in the dataset. Any remaining space in a pack after the sequences have been concatenated is padded to bring all sequences up to the maximum sequence length.\n",
+    "\n",
+    "**Some key features unique to packed datasets are worth mentioning here**:\n",
+    "\n",
+    "- A specific `attention_mask` is generated: It contains a unique index for each sequence of the pack and `0` for the remaining padding tokens. This, essentially, tells the model where to \"look\" from the perspective of a single token, ignoring any encoded information (such as a different sequence) that is not relevant to that token.\n",
+    "    - Example of 3 sequences in a pack: `attention_mask = [1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,0,0,0]`\n",
+    "    - Compared to a single sequence in an unpacked input `attention_mask = [1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0]`\n",
+    "    \n",
+    "\n",
+    "- The `position_ids` of a pack contain the concatenated `position_ids` of each sequence\n",
+    "    - For instance given 3 sequences: `[0,1,2,3,4] + [0,1,2,3] + [0,1,2] -> [1,2,3,4,1,2,3,1,2,...,0,0,0]` (note: the CLS tokens' position id '0' is also moved to the end of the pack)\n",
+    "    \n",
+    "    \n",
+    "- For SQuAD, during training, answers are determined using a start position and end position within the sequence. During preprocessing, these were converted from character positions to token positions. Now, during packing, as tokenized sequences are effectively being concatenated along the same dimension, the positions of the answer will change for any sequence that is not starting at index 0 within a pack. For example, in a pack with 2 sequences:\n",
+    "    - Answer positions before packing:\n",
+    "    ```\n",
+    "    Length of sequence 1: 100 tokens (index 0 to 99)   , start position: 30, end position: 35\n",
+    "    Length of sequence 2: 120 tokens (index 0 to 119)  , start position: 15, end position: 25\n",
+    "    ```\n",
+    "    - Answer positions after packing:\n",
+    "    ```\n",
+    "    Length of sequence 1 in pack 1: 100 tokens (index 0 to 99)   , start position: 30, end position: 35\n",
+    "    Length of sequence 2 in pack 1: 120 tokens (index 100 to 219), start position: 115, end position: 125 \n",
+    "    ```\n",
+    "\n",
+    "    - The positions have been shifted by the total length of preceding sequences in the pack. We call this the `positions_offset`.\n",
+    "\n",
+    "\n",
+    "To create a dataloader-ready packed dataset, all you need to do is call the `create()` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bdcc161d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "packed_train_dataset = train_data_packer.create()\n",
+    "packed_val_dataset = val_data_packer.create()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ce443c8f",
+   "metadata": {},
+   "source": [
+    "Let's visualize one sample of the new `packed_train_dataset`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c966cd9a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "packed_train_dataset[133]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1d8ce9c",
+   "metadata": {},
+   "source": [
+    "## Fine-tuning the model\n",
+    "\n",
+    "Now that our data is ready, we can download the pretrained model and fine-tune it.\n",
+    "\n",
+    "### Implement Packed BERT\n",
+    "\n",
+    "Some model modifications are required to make packing work with BERT. For SQuAD, we create a custom output class to separate the logits according to each of the sequences within the pack and calculate the loss. The existing class `BertForQuestionAnswering` is extended to `PipelinedPackedBertForQuestionAnswering` which incorporates the required modifications to the model. The crux of these changes is to introduce the new attention mask, and modify the hidden layer output of the model to mask any padded inputs from the logits.\n",
+    "\n",
+    "First let's load a default BERT configuration using `AutoConfig`. The config includes a new parameter we must set, `max_sequences_per_pack`, this informs the model of the maximum number of sequences it will need to 'unpack' in the model output. It also allows us to clearly define the `num_labels` and `problem_type` for this model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "254a0f83",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoConfig\n",
+    "\n",
+    "config = AutoConfig.from_pretrained(model_checkpoint)\n",
+    "config.max_sequences_per_pack = max_seq_per_pack\n",
+    "config.num_labels = num_labels"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b7dae37",
+   "metadata": {},
+   "source": [
+    "Now we can instantiate the model class with the config, loading the weights from the model checkpoint. For SQuAD, we can determine the number of \"labels\" as the two output types that will determine whether answers are correct or not, i.e., the start and end position."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3285aaa3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import numpy as np\n",
+    "torch.manual_seed(43)\n",
+    "np.random.seed(43)\n",
+    " \n",
+    "from models.modeling_bert_packed import PipelinedPackedBertForQuestionAnswering\n",
+    "\n",
+    "model = PipelinedPackedBertForQuestionAnswering.from_pretrained(model_checkpoint, config=config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6070000",
+   "metadata": {},
+   "source": [
+    "The warning is telling us we are throwing away some weights and randomly initializing others. This is absolutely normal in this case, because we are removing the head used to pretrain the model on a masked language modeling objective and replacing it with a new head for question answering, for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do.\n",
+    "\n",
+    "We can first test the model on CPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02aac4e1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# test the model on CPU\n",
+    "from transformers.data.data_collator import default_data_collator\n",
+    "\n",
+    "loader = torch.utils.data.DataLoader(packed_train_dataset,\n",
+    "                             batch_size=2,\n",
+    "                             shuffle=True,\n",
+    "                             drop_last=True,\n",
+    "                             collate_fn=default_data_collator)\n",
+    "data = next(iter(loader))\n",
+    "o = model(**data)\n",
+    "print(\"Logits shape:\", o)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5c9922fa",
+   "metadata": {},
+   "source": [
+    "Now, let's prepare the model for IPU.\n",
+    "\n",
+    "First, we set the model in half precision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11502853",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model.half()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7a9ab06f",
+   "metadata": {},
+   "source": [
+    "### Define validation metrics for SQuAD\n",
+    "\n",
+    "Before training and evaluating, a custom postprocessing function needs to be defined for SQuAD. This is because we need to map the predictions of the model back to parts of the context in terms of the character positions in the original untokenized samples. The model predicts logits for the start and end token position of the answer.\n",
+    "\n",
+    "The purpose of the function is to identify each of the tokenized features according to their `example_ids` and map the start and end token positions for the output, taking the top-*n* logit indices and discarding all invalid solutions. It then uses the `offset_mapping` to map the start and end token-level positions back to character-level positions within the context, and generates a text answer using the original context. This text prediction can then be used to calculate accuracy metrics and compared to the target answer present in the dataset.\n",
+    "\n",
+    "The `postprocess_qa_predictions()` function is adapted for packing, taken directly from the existing [tutorial for SQuAD finetuning for the IPU](https://github.com/huggingface/optimum-graphcore/blob/main/notebooks/question_answering.ipynb) for an unpacked dataset. The use of this function is fully described in that tutorial. \n",
+    "\n",
+    "The main changes to the function for packing include: \n",
+    "* Instead of iterating through all the features in the tokenized dataset, and obtaining the `example_id` field created during tokenization of the validation dataset, this function iterates through each feature within each pack, obtaining the corresponding `example_id` for each feature within the pack. \n",
+    "\n",
+    "* It saves the index of the pack in the dataset, **as well as the index of the feature within the pack**, to allow the function to easily and linearly obtain the features to perform validation on.\n",
+    "\n",
+    "This postprocessing is available ready-to-use from the packing utils: `utils.packing`, and can simply be initialised."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f4c6e4d0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from utils.packing.qa_utils import postprocess_packed_qa_predictions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "38c76b0b",
+   "metadata": {},
+   "source": [
+    "Finally, a `compute_validation_metrics` function is created to take in the postprocessed predictions. This obtains the answers from the dataset, maps them according to the `example_id` to the corresponding prediction, and uses `metric` from the 🤗 Evaluate library to compute the relevant metrics for SQuAD, including an \"exact match\" accuracy, as well as F1 score, for each answer. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5420ef6a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def compute_validation_metrics(predictions, raw_validation_dataset, packed_validation_dataset_unformatted, metric):\n",
+    "    \"\"\"Postprocess the raw packed predictions and score them against the reference answers.\n",
+    "\n",
+    "    The three data arguments are forwarded to `postprocess_packed_qa_predictions`, whose\n",
+    "    result is iterated as `example id -> predicted answer text` pairs.\n",
+    "    Returns whatever `metric.compute` returns for those predictions and references.\n",
+    "    \"\"\"\n",
+    "    # Reference answers, one entry per raw validation example.\n",
+    "    references = []\n",
+    "    for example in raw_validation_dataset:\n",
+    "        references.append({\"id\": example[\"id\"], \"answers\": example[\"answers\"]})\n",
+    "\n",
+    "    answers_by_id = postprocess_packed_qa_predictions(\n",
+    "        raw_validation_dataset, packed_validation_dataset_unformatted, predictions\n",
+    "    )\n",
+    "\n",
+    "    # Re-shape the id -> text mapping into the record layout passed to `metric.compute`.\n",
+    "    scored_predictions = []\n",
+    "    for example_id, answer_text in answers_by_id.items():\n",
+    "        scored_predictions.append({\"id\": example_id, \"prediction_text\": answer_text})\n",
+    "\n",
+    "    return metric.compute(predictions=scored_predictions, references=references)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "acdde982",
+   "metadata": {},
+   "source": [
+    "### Train and validate the model using the 🤗 Optimum Graphcore `Trainer`\n",
+    "\n",
+    "Now let's prepare the model for IPU, instantiate the options and machine configurations and create an IPU Trainer to efficiently and easily perform training on the IPU in just a few lines.\n",
+    "\n",
+    "We need to define the `IPUConfig`, which is a class that specifies attributes and configuration parameters to compile and put the model on the device. We initialize it with one config name or path, which we set earlier. Then we use it to set the model attribute `model.ipu_config`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e2073fcd",
+   "metadata": {},
+   "source": [
+    "As we are using a pre-trained checkpoint, we can use the existing IPU configuration for `\"Graphcore/bert-base-uncased\"` for the custom model. This should require no changes as even though the model has been modified to be compatible with a packed dataset, the pipelining stages and IPU options will remain the same. \n",
+    "\n",
+    "Some of the options have been specified when defining the `ipu_config` to highlight the global batch size. This uses the configurations defined at the beginning of this script. Note that we can also define inference specific device iterations and replication factors for performing validation on the model, to modify the validation global batch size."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b0452e1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from optimum.graphcore import IPUConfig, IPUTrainer, IPUTrainingArguments\n",
+    "\n",
+    "ipu_config = IPUConfig.from_pretrained(\n",
+    "    ipu_config_name,\n",
+    "    executable_cache_dir = executable_cache_dir,\n",
+    "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
+    "    device_iterations=device_iterations,\n",
+    "    replication_factor=1,\n",
+    "    embedding_serialization_factor=1,\n",
+    "    inference_device_iterations= 64,\n",
+    "    inference_replication_factor=1,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a6b8635",
+   "metadata": {},
+   "source": [
+    "To instantiate an `IPUTrainer`, we will need to define `IPUTrainingArguments`, which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "141a2e2d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "training_args = IPUTrainingArguments(\n",
+    "    output_dir=f\"./{model_checkpoint}-{model_task}\",\n",
+    "    per_device_train_batch_size=micro_batch_size,\n",
+    "    per_device_eval_batch_size=8,\n",
+    "    num_train_epochs=3,\n",
+    "    learning_rate=9e-05,\n",
+    "    loss_scaling=64.0,\n",
+    "    weight_decay=0.01,\n",
+    "    warmup_ratio=0.25,\n",
+    "    lr_scheduler_type='cosine',\n",
+    "    pod_type=pod_type,\n",
+    "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
+    "    dataloader_mode=\"async_rebatched\",\n",
+    "    dataloader_drop_last=True,\n",
+    "    dataloader_num_workers=64,\n",
+    "    logging_steps=5\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5e150ed",
+   "metadata": {},
+   "source": [
+    "**Note that we do not set evaluation to be performed during the training process for SQuAD**. This is due to the custom postprocessing steps required to extract text-level answers for SQuAD, for which the logits cannot be easily modified without multiple function inputs, such as the tokenized and raw datasets, while the `preprocess_logits_for_metrics` argument provided in `IPUTrainer` can only utilise logits alone. Therefore, validation is done after training."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eeb965d0",
+   "metadata": {},
+   "source": [
+    "We will need a data collator that will batch our processed examples together. Here we will use the default data collator imported from the Transformers library. This is passed to the `IPUTrainer` class. \n",
+    "\n",
+    "Then we just need to pass all of this along with our datasets to the IPUTrainer:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "561a41ca",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import default_data_collator\n",
+    "\n",
+    "trainer = IPUTrainer(\n",
+    "    model=model,\n",
+    "    ipu_config=ipu_config,\n",
+    "    args=training_args,\n",
+    "    train_dataset=packed_train_dataset,\n",
+    "    data_collator=default_data_collator\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "07b0933f",
+   "metadata": {},
+   "source": [
+    "We can now finetune our model by just calling the train method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c4cbe563",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "train_run_metrics = trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c31ad4dc",
+   "metadata": {},
+   "source": [
+    "You can now upload the result of the training to the Hub if you successfully logged in at the beginning of this notebook. Just uncomment and execute this instruction:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b9e60061",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# trainer.push_to_hub()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a6d0fc29",
+   "metadata": {},
+   "source": [
+    "Then save the model with the model checkpoint name."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "625847dc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "# Join with `/`: `Path + str` raises a TypeError, and `/` inserts the path separator\n",
+    "# even when CHECKPOINT_DIR is set without a trailing slash.\n",
+    "# Convert back to `str` so the value can be handed to APIs that expect a plain path string.\n",
+    "saved_model_checkpoint = str(Path(os.getenv('CHECKPOINT_DIR', '/tmp/')) / f\"{model_checkpoint}-{model_task}\")\n",
+    "trainer.save_model(saved_model_checkpoint)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c93fb854",
+   "metadata": {},
+   "source": [
+    "We can then perform the evaluation by using the `IPUTrainer`'s `predict` functionality. This provides all of the raw predictions for the packed inputs for validation. This will, by default, use the global batch size defined specifically for inference in the `IPUTrainingArguments`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c65f6830",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "raw_predictions = trainer.predict(packed_val_dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7553b34d",
+   "metadata": {},
+   "source": [
+    "Once the predictions have been obtained, the validation metrics can be computed by passing them into the `compute_validation_metrics` function. This, as described previously, performs the necessary postprocessing on the logits and obtains text answers, then computes the accuracy metrics (exact match and F1 score) for SQuAD finetuning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "825dd9a8",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "val_metrics = compute_validation_metrics(\n",
+    "    raw_predictions.predictions, raw_validation_dataset, packed_val_dataset, metric)\n",
+    "\n",
+    "print(val_metrics)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "50eb0d90",
+   "metadata": {},
+   "source": [
+    "## Faster batched inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6bda303c",
+   "metadata": {},
+   "source": [
+    "Packing can also be used for inference, particularly for performing inference on large workloads. This section demonstrates how to perform faster, batched inference with a large number of samples using a super-easy custom pipeline which batches and packs your input data, performs inference and returns postprocessed predictions. \n",
+    "\n",
+    "For the pipeline, we need to import it, and initialise a few essential parameters.\n",
+    "\n",
+    "The `model` is the model checkpoint; we are going to use the locally saved checkpoint generated from training on SQuAD. The `executable_cache_dir` and `max_seq_length` must also be specified. If you are loading a saved model without a pre-trained tokenizer saved in the checkpoint folder, the tokenizer will be loaded automatically from `bert-base-uncased`. If you wish to load a different pre-trained tokenizer, you can specify this by passing the `pretrained_tokenizer` argument with the name of your tokenizer to the `PackedBertQuestionAnsweringPipeline`.\n",
+    "\n",
+    "The pipeline will automatically determine your model's IPU config, given that the checkpoint was trained using Optimum Graphcore, which will be the case for the model fine-tuned in this notebook.\n",
+    "\n",
+    "In this example, we pre-load the IPUConfig and modify some of the default parameters to get the best performance out of inference and leverage the benefits of IPU parallelism. The micro-batch size can also be specified, for which the default is 1.\n",
+    "\n",
+    "When training, the packing factor affects the convergence the same way as a large increase in batch size would do. However, for inference, we are free to use a bigger packing factor to speed it up. Let's try it with `max_seq_per_pack = 12`.\n",
+    "\n",
+    "**Note:** Packing brings huge benefits for performing inference on large amounts of data. For small-scale inference tasks, such as those better suited to sequential inference on a single un-batched input, the generic Optimum Graphcore `TextClassificationPipeline` may be preferred. This won't affect fine-tuning: the weights generated from fine-tuning using packing will work just the same!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "50cd4b0e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from pipeline.packed_bert import PackedBertQuestionAnsweringPipeline\n",
+    "\n",
+    "model = saved_model_checkpoint\n",
+    "# model = f'your_username/{model_checkpoint}-{model_task}'  # uncomment this and use your username to load from Hugging Face Hub\n",
+    "\n",
+    "inference_boosted_ipu_config = IPUConfig.from_pretrained(model, \n",
+    "        inference_device_iterations=32,\n",
+    "        inference_replication_factor=4,\n",
+    "        ipus_per_replica=1,\n",
+    "        layers_per_ipu=[12]\n",
+    "    )\n",
+    "\n",
+    "pipeline = PackedBertQuestionAnsweringPipeline(\n",
+    "    model = model,\n",
+    "    executable_cache_dir = executable_cache_dir,\n",
+    "    max_seq_per_pack=12,\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    ipu_config=inference_boosted_ipu_config,\n",
+    "    micro_batch_size=8\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "42d720aa",
+   "metadata": {},
+   "source": [
+    "The pipeline expects a **list of strings** directly passed to it in the format: \n",
+    "```\n",
+    "questions=[<list of questions>], contexts=[<list of contexts>]\n",
+    "```\n",
+    "There is no need to tokenize, preprocess, pack or postprocess the data to use the inference pipeline.\n",
+    "\n",
+    "As a test, we can load the entire SQuAD validation dataset and perform packed inference using `.predict()` on the `question` and `context` columns to generate predictions. Postprocessing samples for SQuAD is done on a sample-by-sample, unbatched basis so this may take a few minutes with or without packing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81969920",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "dataset = datasets.load_dataset('squad')\n",
+    "preds = pipeline.predict(questions=dataset['validation']['question'],\n",
+    "                         contexts=dataset['validation']['context'])\n",
+    "\n",
+    "print(preds.keys())\n",
+    "print(f\"Number of predictions: {len(preds['predictions'])}\")\n",
+    "print(f\"Preprocessing time: {preds['preprocessing_time']}s\")\n",
+    "print(f\"Postprocessing time: {preds['postprocessing_time']}s\")\n",
+    "print(f\"Throughput: {preds['throughput']} samples/s\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c77e5ac",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "There is minimal overhead from tokenizing and packing the dataset, but the speed benefits for inference are evident. Running the above pipeline, we achieve a throughput of approximately 6000 samples per second, showing an approximate 2x speed-up for SQuAD."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/packed-bert/packedBERT_single_label_text_classification.ipynb b/packed-bert/packedBERT_single_label_text_classification.ipynb
new file mode 100644
index 0000000..f2adfd3
--- /dev/null
+++ b/packed-bert/packedBERT_single_label_text_classification.ipynb
@@ -0,0 +1,1366 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "X4cRE8IbIrIV"
+   },
+   "source": [
+    "First of all, make sure your environment has the latest version of [🤗 Optimum Graphcore](https://github.com/huggingface/optimum-graphcore) installed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%pip install git+https://github.com/huggingface/optimum-graphcore.git"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Also make sure all the packages required for this notebook are installed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%pip install scikit-learn;\n",
+    "%pip install datasets\n",
+    "%pip install evaluate\n",
+    "%pip install tokenizers\n",
+    "%pip install matplotlib\n",
+    "%pip install scipy\n",
+    "%pip install --force-reinstall huggingface_hub==0.11.1;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's start by importing and printing out the versions of `Transformers` and `Optimum Graphcore`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import transformers\n",
+    "import optimum.graphcore\n",
+    "\n",
+    "print(transformers.__version__)\n",
+    "print(optimum.graphcore.__version__)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "At the end of this notebook, to be able to share your model with the community and easily access it through HuggingFace, there are some short set-up steps you must follow to enable uploading your checkpoint to the HuggingFace Hub.\n",
+    "\n",
+    "First you have to store your authentication token from the Hugging Face website ([sign up here](https://huggingface.co/join) if you haven't already!), then execute the following cell and input your token:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Git-lfs must also be installed to enable large file storage when pushing to the hub:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "! apt install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rEJBSTyZIrIb"
+   },
+   "source": [
+    "# Faster single-label text classification with PackedBERT \n",
+    "\n",
+    "This notebook builds on the process of [fine-tuning BERT on a text classification task](text_classification.ipynb), using [packing](https://www.graphcore.ai/posts/introducing-packed-bert-for-2x-faster-training-in-natural-language-processing), an optimisation method originally used for 2x faster BERT pre-training, which can now also provide massive throughput increases for fine-tuning and batched inference! \n",
+    "\n",
+    "**So, what *is* packing?** The basic idea of 'packing' a dataset is to utilise the requirement for constant-shaped inputs into a model. Instead of padding it with empty, unused space, we can recycle this unused space and fill it with more inputs! The architecture of transformer models like BERT supports this, and lets us optimally use this space to process multiple sequences within one input.\n",
+    "\n",
+    "**And here is why you might want to use it:** Having a single input contain multiple sequences leads to multiple sequences being processed in parallel in a single pass within a single iteration inside a batch, increasing the 'effective' batch size of the model by a considerable factor in many cases, and most importantly, increasing model throughput for training and batched inference significantly.\n",
+    "\n",
+    "This notebook outlines how to easily enable packing for BERT when performing fine-tuning/inference on a text-classification task in 🤗 Optimum, resulting in an impressive 5-6x faster training and inference run-time on the `GLUE/sst2` dataset. \n",
+    "\n",
+    "You can read more about packing in the original [paper](https://arxiv.org/abs/2107.02027)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "kTCFado4IrIc"
+   },
+   "source": [
+    "In this notebook, we will see how to fine-tune BERT, a [🤗 Transformers](https://github.com/huggingface/transformers) model to a text classification task of the [GLUE Benchmark](https://gluebenchmark.com/).\n",
+    "\n",
+    "The GLUE Benchmark is a group of nine classification tasks on sentences or pairs of sentences, which are:\n",
+    "\n",
+    "- [CoLA](https://nyu-mll.github.io/CoLA/) (Corpus of Linguistic Acceptability) Determine if a sentence is grammatically correct or not. It is a dataset containing sentences labeled as grammatically correct or not.\n",
+    "- [MNLI](https://arxiv.org/abs/1704.05426) (Multi-Genre Natural Language Inference) Determine if a sentence entails, contradicts or is unrelated to a given hypothesis. (This dataset has two versions, one with the validation and test set coming from the same distribution, another called mismatched where the validation and test use out-of-domain data.)\n",
+    "- [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398) (Microsoft Research Paraphrase Corpus) Determine if two sentences are paraphrases from one another or not.\n",
+    "- [QNLI](https://rajpurkar.github.io/SQuAD-explorer/) (Question-answering Natural Language Inference) Determine if the answer to a question is in the second sentence or not. (This dataset is built from the SQuAD dataset.)\n",
+    "- [QQP](https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Quora Question Pairs2) Determine if two questions are semantically equivalent or not.\n",
+    "- [RTE](https://aclweb.org/aclwiki/Recognizing_Textual_Entailment) (Recognizing Textual Entailment) Determine if a sentence entails a given hypothesis or not.\n",
+    "- [SST-2](https://nlp.stanford.edu/sentiment/index.html) (Stanford Sentiment Treebank) Determine if the sentence has a positive or negative sentiment.\n",
+    "- [STS-B](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) (Semantic Textual Similarity Benchmark) Determine the similarity of two sentences with a score from 1 to 5.\n",
+    "- [WNLI](https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html) (Winograd Natural Language Inference) Determine if a sentence with an anonymous pronoun and a sentence with this pronoun replaced are entailed or not. (This dataset is built from the Winograd Schema Challenge dataset.)\n",
+    "\n",
+    "We will see how to easily load the dataset for these tasks and use BERT with packing to fine-tune a model on SST-2. Each task is named using an acronym, with `mnli-mm` standing for the 'mis-matched' version of MNLI (so it is the same training set as `mnli` but with different validation and test sets):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "YZbiBDuGIrId",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "GLUE_TASKS = [\"cola\", \"mnli\", \"mnli-mm\", \"mrpc\", \"qnli\", \"qqp\", \"rte\", \"sst2\", \"stsb\", \"wnli\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**For this Packed BERT demo, we cover (single-label) sequence classification on the `sst2` dataset. The `task` can be changed to run the other `GLUE` tasks. However, training hyperparameters may need further tuning for these other tasks.**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "4RRkXuteIrIh"
+   },
+   "source": [
+    "Let's initialise our training configurations. \n",
+    "\n",
+    "In this notebook, we are using both data parallelism and pipeline parallelism (see this [tutorial](https://github.com/graphcore/tutorials/blob/master/tutorials/pytorch/efficient_data_loading/walkthrough.ipynb) for more). Therefore the global batch size, which is the actual number of samples used for the weight update, is determined using four factors:\n",
+    "\n",
+    "    global batch size = micro_batch_size * gradient accumulation steps * device iterations * replication factor\n",
+    "\n",
+    "The replication factor is determined by `pod_type`, which will be used as a key to select the replication factor from a dictionary defined in the IPU config file. For example, the dictionary in the IPU config file [Graphcore/roberta-base-ipu](https://huggingface.co/Graphcore/roberta-base-ipu/blob/main/ipu_config.json) looks like this:\n",
+    "\n",
+    "    \"replication_factor\": {\"pod4\": 1, \"pod8\": 2, \"pod16\": 4, \"pod32\": 8, \"pod64\": 16, \"default\": 1}\n",
+    "\n",
+    "Depending on your model and the pod machine you are using, you might need to adjust these three batch-size-related arguments.\n",
+    "\n",
+    "By default this notebook is configured to run on 4 IPUs.\n",
+    "\n",
+    "Finally, `max_seq_length` is the maximum length a sequence can be, and all sequences will be padded to this length, so it should not be larger than the maximum length of the model. Set these parameters and the rest of the notebook should run smoothly:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Given the small size of the sequences in `sst2`, we can reduce the model maximum input size to `max_seq_length = 256`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zVvslsfMIrIh",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "task = \"sst2\"\n",
+    "model_checkpoint = \"bert-base-uncased\"\n",
+    "ipu_config_name = \"Graphcore/bert-base-uncased\"\n",
+    "micro_batch_size = 2\n",
+    "gradient_accumulation_steps = 32\n",
+    "device_iterations = 32\n",
+    "max_seq_length = 256"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Gradients are not calculated during validation, so gradient accumulation is not applicable, and the global batch size for validation can be defined separately as:\n",
+    "\n",
+    "```\n",
+    "global_validation_batch_size=device_iterations*replication_factor*max_seq_per_pack\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "Values for machine size and cache directories can be configured through environment variables or directly in the notebook:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "pod_type = os.getenv(\"GRAPHCORE_POD_TYPE\", \"pod4\")\n",
+    "executable_cache_dir = os.getenv(\"POPLAR_EXECUTABLE_CACHE_DIR\", \"/tmp/\") + \"packed_bert_slseqcls_exe_cache/\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "whPRbBNbIrIl"
+   },
+   "source": [
+    "## Loading the dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "W7QYTpxXIrIl"
+   },
+   "source": [
+    "We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library and the  [🤗 Evaluate](https://github.com/huggingface/evaluate) library to download the data and get the metric we need to use for evaluation (to compare our model to the benchmark). This can be easily done with the functions `load_dataset` and `evaluate.load()`.  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IreSlFmlIrIm",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "import evaluate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "CKx2zKs5IrIq"
+   },
+   "source": [
+    "Apart from `mnli-mm` being a special code, we can directly pass our task name to those functions. `load_dataset` will cache the dataset to avoid downloading it again the next time you run this cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "actual_task = \"mnli\" if task == \"mnli-mm\" else task\n",
+    "dataset = load_dataset(\"glue\", actual_task, cache_dir=os.getenv(\"POPLAR_EXECUTABLE_CACHE_DIR\", \"/tmp/\") + \"packed_bert_slseqcls_dataset_cache/\")\n",
+    "metric = evaluate.load('glue', actual_task)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RzfPtOMoIrIu"
+   },
+   "source": [
+    "The `dataset` object itself is a [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key for the training, validation and test set (with more keys for the mismatched validation and test set in the special case of `mnli`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "GWiVUF0jIrIv",
+    "outputId": "35e3ea43-f397-4a54-c90c-f2cf8d36873e",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "u3EtYfeHIrIz"
+   },
+   "source": [
+    "To access an actual element, you need to select a split first, then give an index:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "X6HrpprwIrIz",
+    "outputId": "d7670bc0-42e4-4c09-8a6a-5c018ded7d95",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "dataset[\"train\"][:10]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "WHUmphG3IrI3"
+   },
+   "source": [
+    "To get a sense of what the data looks like, the following function will show some examples picked randomly in the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "i3j8APAoIrI3",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "import random\n",
+    "import pandas as pd\n",
+    "from IPython.display import display, HTML\n",
+    "\n",
+    "def show_random_elements(dataset, num_examples=10):\n",
+    "    \"\"\"Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.\n",
+    "\n",
+    "    Columns whose feature type is `datasets.ClassLabel` are shown with their class names\n",
+    "    instead of the integer label ids.\n",
+    "    \"\"\"\n",
+    "    assert num_examples <= len(dataset), \"Can't pick more elements than there are in the dataset.\"\n",
+    "    last_index = len(dataset) - 1\n",
+    "    chosen = []\n",
+    "    # Keep drawing until enough unique row indices have been collected.\n",
+    "    while len(chosen) < num_examples:\n",
+    "        candidate = random.randint(0, last_index)\n",
+    "        if candidate not in chosen:\n",
+    "            chosen.append(candidate)\n",
+    "    \n",
+    "    table = pd.DataFrame(dataset[chosen])\n",
+    "    for name, feature in dataset.features.items():\n",
+    "        if not isinstance(feature, datasets.ClassLabel):\n",
+    "            continue\n",
+    "        # Swap integer label ids for their human-readable class names.\n",
+    "        table[name] = table[name].transform(lambda label_id: feature.names[label_id])\n",
+    "    display(HTML(table.to_html()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "SZy5tRB_IrI7",
+    "outputId": "ba8f2124-e485-488f-8c0c-254f34f24f13",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "show_random_elements(dataset[\"train\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "lnjDIuQ3IrI-"
+   },
+   "source": [
+    "The metric is an instance of [`evaluate.EvaluationModule`](https://huggingface.co/docs/evaluate/package_reference/main_classes#evaluate.EvaluationModule):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "5o4rUteaIrI_",
+    "outputId": "18038ef5-554c-45c5-e00a-133b02ec10f1",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "metric"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "jAWdqcUBIrJC"
+   },
+   "source": [
+    "You can call its `compute` method with your predictions and labels directly and it will return a dictionary with the metric(s) value:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "6XN1Rq0aIrJC",
+    "outputId": "a4405435-a8a9-41ff-9f79-a13077b587c7",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "fake_preds = np.random.randint(0, 2, size=(64,))\n",
+    "fake_labels = np.random.randint(0, 2, size=(64,))\n",
+    "metric.compute(predictions=fake_preds, references=fake_labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "YOCrQwPoIrJG"
+   },
+   "source": [
+    "Note that `load_metric` has loaded the proper metric associated to your task, which is:\n",
+    "\n",
+    "- for CoLA: [Matthews Correlation Coefficient](https://en.wikipedia.org/wiki/Matthews_correlation_coefficient)\n",
+    "- for MNLI (matched or mismatched): Accuracy\n",
+    "- for MRPC: Accuracy and [F1 score](https://en.wikipedia.org/wiki/F1_score)\n",
+    "- for QNLI: Accuracy\n",
+    "- for QQP: Accuracy and [F1 score](https://en.wikipedia.org/wiki/F1_score)\n",
+    "- for RTE: Accuracy\n",
+    "- for SST-2: Accuracy\n",
+    "- for STS-B: [Pearson Correlation Coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient) and [Spearman's Rank Correlation Coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)\n",
+    "- for WNLI: Accuracy\n",
+    "\n",
+    "so the metric object only computes the one(s) needed for your task."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "n9qywopnIrJH"
+   },
+   "source": [
+    "## Preprocessing the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "YVx71GdAIrJH"
+   },
+   "source": [
+    "Before we can feed the texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.\n",
+    "\n",
+    "To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:\n",
+    "\n",
+    "- we get a tokenizer that corresponds to the model architecture we want to use,\n",
+    "- we download the vocabulary used when pretraining this specific checkpoint.\n",
+    "\n",
+    "That vocabulary will be cached, so it's not downloaded again the next time we run the cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "eXNLu_-nIrJI",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "    \n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Vl6IidfdIrJK"
+   },
+   "source": [
+    "We pass along `use_fast=True` to the call above to use one of the fast tokenizers (backed by Rust) from the 🤗 Tokenizers library. Those fast tokenizers are available for almost all models, but if you got an error with the previous call, remove that argument."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rowT4iCLIrJK"
+   },
+   "source": [
+    "You can directly call this tokenizer on one sentence or a pair of sentences:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "a5hBlsrHIrJL",
+    "outputId": "acdaa98a-a8cd-4a20-89b8-cc26437bbe90",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "tokenizer(\"Hello, this is one sentence!\", \"And this sentence goes with it.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "qo_0B1M2IrJM"
+   },
+   "source": [
+    "Depending on the model you selected, you will see different keys in the dictionary returned by the cell above. They don't matter much for what we're doing here (just know they are required by the model we will instantiate later), you can learn more about them in [this tutorial](https://huggingface.co/transformers/preprocessing.html) if you're interested.\n",
+    "\n",
+    "To preprocess our dataset, we will thus need the names of the columns containing the sentence(s). The following dictionary keeps track of the correspondence task to column names:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "fyGdtK9oIrJM",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "task_to_keys = {\n",
+    "    \"cola\": (\"sentence\", None),\n",
+    "    \"mnli\": (\"premise\", \"hypothesis\"),\n",
+    "    \"mnli-mm\": (\"premise\", \"hypothesis\"),\n",
+    "    \"mrpc\": (\"sentence1\", \"sentence2\"),\n",
+    "    \"qnli\": (\"question\", \"sentence\"),\n",
+    "    \"qqp\": (\"question1\", \"question2\"),\n",
+    "    \"rte\": (\"sentence1\", \"sentence2\"),\n",
+    "    \"sst2\": (\"sentence\", None),\n",
+    "    \"stsb\": (\"sentence1\", \"sentence2\"),\n",
+    "    \"wnli\": (\"sentence1\", \"sentence2\"),\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "xbqtC4MrIrJO"
+   },
+   "source": [
+    "We can double check it does work on our current dataset:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "19GG646uIrJO",
+    "outputId": "0cb4a520-817e-4f92-8de8-bb45df367657",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "sentence1_key, sentence2_key = task_to_keys[task]\n",
+    "\n",
+    "if sentence2_key is None:\n",
+    "    print(f\"Sentence: {dataset['train'][0][sentence1_key]}\")\n",
+    "else:\n",
+    "    print(f\"Sentence 1: {dataset['train'][0][sentence1_key]}\")\n",
+    "    print(f\"Sentence 2: {dataset['train'][0][sentence2_key]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2C0hcmp9IrJQ"
+   },
+   "source": [
+    "We can then write the function that will preprocess our samples. We just feed them to the `tokenizer` with two arguments: `truncation=True` ensures that an input longer than the maximum length is truncated to the maximum length, and `max_length=max_seq_length` sets the maximum length of a sequence.\n",
+    "\n",
+    "**Important: since we will use packing later, we don't want to perform any padding in the tokenizer.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vc0BSBLIIrJQ",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# no padding for packing\n",
+    "def preprocess_function(examples):\n",
+    "    if sentence2_key is None:\n",
+    "        return tokenizer(examples[sentence1_key], truncation=True, max_length=max_seq_length)\n",
+    "    \n",
+    "    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, max_length=max_seq_length)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0lm8ozrJIrJR"
+   },
+   "source": [
+    "This function works with one or several examples. In the case of several examples, the tokenizer will return a list of lists for each key:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "-b70jh26IrJS",
+    "outputId": "acd3a42d-985b-44ee-9daa-af5d944ce1d9",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "preprocess_function(dataset['train'][:5])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "zS-6iXTkIrJT"
+   },
+   "source": [
+    "To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the `map` method of our `dataset` object we created earlier. This will apply the function on all the elements of all the splits in `dataset`, so our training, validation and testing data will be preprocessed in one single command."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DDtsaJeVIrJT",
+    "outputId": "aa4734bf-4ef5-4437-9948-2c16363da719",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "encoded_dataset = dataset.map(preprocess_function, batched=True)\n",
+    "len(encoded_dataset['train'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "voWiw8C7IrJV"
+   },
+   "source": [
+    "Even better, the results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to map has changed (and thus requires to not use the cache data). For instance, it will properly detect if you change the task in the first cell and rerun the notebook. 🤗 Datasets warns you when it uses cached files, you can pass `load_from_cache_file=False` in the call to `map` to not use the cached files and force the preprocessing to be applied again.\n",
+    "\n",
+    "Note that we passed `batched=True` to encode the texts by batches together. This is to leverage the full benefit of the fast tokenizer we loaded earlier, which will use multi-threading to treat the texts in a batch concurrently."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Packing the dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To implement packing, we need to pack our dataset first. Each new element will be a \"pack\" containing at most `max_seq_per_pack` sequences."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "max_seq_per_pack = 6"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We also define the number of labels in our dataset. `sst2` is a single-label task: each example has exactly one true class."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "num_labels = 3 if task.startswith(\"mnli\") else 1 if task==\"stsb\" else 2\n",
+    "problem_type = 'single_label_classification'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Packing algorithm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In order to pack efficiently, we will use a histogram-based algorithm: shortest-pack-first histogram packing (SPFHP) presented in the [blog post](https://www.graphcore.ai/posts/introducing-packed-bert-for-2x-faster-training-in-natural-language-processing) adapted from the [blog code](https://github.com/graphcore/tutorials/tree/master/blogs_code/packedBERT). The full process of packing the dataset consists of four steps:\n",
+    "\n",
+    "1. Create a histogram of the sequence lengths of the dataset.\n",
+    "2. Generate the 'strategy' for the dataset using one of the state-of-the-art packing algorithms, which maps out the order and indices of the sequences that need to be packed together.\n",
+    "3. Use this strategy to create the actual dataset, concatenating the tokenized features together for each column in the dataset, including the labels.\n",
+    "4. Finally, pass these new columns into a custom PyTorch dataset, ready to be passed to the PopTorch dataloader!\n",
+    "\n",
+    "These steps have been simplified through the easy-to-use `utils.packing` available in Graphcore Optimum. You can simply generate the packed dataset after the usual tokenization and preprocessing by passing all necessary packing configuration to the `PackedDatasetCreator` class, and generate the ready-to-use PyTorch dataset with `.create()`.\n",
+    "\n",
+    "Within the function, there are some column names used by default. The expected default columns for text classification include:\n",
+    "* `input_ids`\n",
+    "* `attention_mask`\n",
+    "* `token_type_ids`\n",
+    "* `labels`\n",
+    "\n",
+    "These should all be generated automatically when tokenizing any classification dataset for BERT. However, the labels key, as it is not encoded, may have a different name. For this dataset, the column key for the labels is `label`. Since the dataset creator expects `labels`, we pass this name to the `custom_label_key` argument so the class can find our labels.\n",
+    "\n",
+    "The `PackedDatasetCreator` requires different instantiations for different datasets, so it must be called separately for each of our dataset splits. We can set either `training`, `validation` or `inference` to `True` as needed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from utils.packing.dataset_creator import PackedDatasetCreator\n",
+    "\n",
+    "train_data_packer = PackedDatasetCreator(\n",
+    "    tokenized_dataset = encoded_dataset['train'],\n",
+    "    max_sequence_length = max_seq_length,\n",
+    "    max_sequences_per_pack = max_seq_per_pack,\n",
+    "    training = True,\n",
+    "    num_labels = num_labels,\n",
+    "    problem_type = problem_type,\n",
+    "    algorithm = 'SPFHP',\n",
+    "    custom_label_key = 'label'\n",
+    ")\n",
+    "\n",
+    "val_data_packer = PackedDatasetCreator(\n",
+    "    tokenized_dataset = encoded_dataset['validation'],\n",
+    "    max_sequence_length = max_seq_length,\n",
+    "    max_sequences_per_pack = max_seq_per_pack,\n",
+    "    validation = True,\n",
+    "    num_labels = num_labels,\n",
+    "    problem_type = problem_type,\n",
+    "    algorithm = 'SPFHP',\n",
+    "    custom_label_key = 'label'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This will create the strategy and initialise the necessary parameters for packing the dataset. We can see that the ideal speed-up we have achieved is approximately 5.15x the original dataset, which corresponds directly to the average packing factor: the average number of sequences within one pack.\n",
+    "\n",
+    "The `PackedDatasetCreator` class also has some other features we do not use here for training, such as `pad_to_global_batch_size`. This feature is useful for performing batched inference on a large number of samples when we do not want to lose any of the samples when creating data iterators using the `poptorch.DataLoader`: it applies 'vertical' padding to the dataset, adding filler rows to bring the dataset size up to a value divisible by the global batch size, which allows the largest possible batch sizes to be used without any loss of data."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can also view the histogram generated in the first step of the packing process, to observe whether the distribution of sequence lengths in the dataset will benefit from packing - as a general rule, as long as the average length of the sequences in the dataset is 50% or less of the maximum sequence length, packing will offer at least a 2x throughput benefit, in other words: `throughput_increase ≈ max_seq_len/mean_seq_len`\n",
+    "\n",
+    "Many datasets have distributions with much smaller average lengths, and will benefit much more. We can easily observe this distribution by retrieving and plotting the histogram from the data class:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "train_histogram = train_data_packer.histogram\n",
+    "\n",
+    "plt.hist(train_histogram, bins = [k for k in range(0,max_seq_length,10)]) \n",
+    "plt.title(\"Sequence length histogram\") \n",
+    "plt.xlabel('Sequence lengths')\n",
+    "plt.ylabel('Frequency')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we need to create the actual packed dataset, this is the 3rd step of the packing process outlined above.\n",
+    "\n",
+    "In this stage, we take the strategy for mapping the sequences by size into 'packs' that was generated by the packing algorithm, and use this to extract the sequences from the tokenized dataset, inserting them into packs for each column in the dataset. Any remaining space in a pack after the sequences have been concatenated is padded to bring all sequences up to the maximum sequence length.\n",
+    "\n",
+    "Some key features unique to packed datasets are worth mentioning here:\n",
+    "\n",
+    "- A specific `attention_mask` is generated: It contains a unique index for each sequence of the pack and `0` for the remaining padding tokens. This, essentially, tells the model where to \"look\" from the perspective of a single token, ignoring any encoded information (such as a different sequence) that is not relevant to that token.\n",
+    "    - Example of 3 sequences: `attention_mask = [1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,0,...,0,1,2,3]`\n",
+    "\n",
+    "\n",
+    "- The [CLS] tokens of each sequence must be moved to the end of the pack.\n",
+    "    - For instance: `[CLS,a,b,c] + [CLS, d,e,f] + [CLS, g,h,i] -> [a,b,c,d,e,f,g,h,i,...,CLS,CLS,CLS]`\n",
+    "    \n",
+    "\n",
+    "- The `position_ids` of a pack contain the concatenated `position_ids` of each sequence\n",
+    "    - For instance given 3 sequences: `[0,1,2,3,4] + [0,1,2,3] + [0,1,2] -> [1,2,3,4,1,2,3,1,2,...,0,0,0]` (note: the position id '0' of each CLS token is also moved to the end of the pack)\n",
+    "    \n",
+    "- `labels` and `token_type_ids` are also packed to correspond to the `input_ids` pack.\n",
+    "\n",
+    "\n",
+    "To create a dataloader-ready packed dataset, all you need to do is call the `create()` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "packed_train_dataset = train_data_packer.create()\n",
+    "packed_val_dataset = val_data_packer.create()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's visualize one sample of the new `packed_train_dataset`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "packed_train_dataset[133]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "545PP3o8IrJV"
+   },
+   "source": [
+    "## Fine-tuning the model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "FBiW8UpKIrJW"
+   },
+   "source": [
+    "Now that our data is ready, we can download the pretrained model and fine-tune it."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Implement Packed BERT\n",
+    "\n",
+    "A few model modifications are required to make packing work with BERT.\n",
+    "We extend the existing class `BertForSequenceClassification` to `PipelinedPackedBertForSequenceClassification` which incorporates the required changes to the pooler and the model output. The crux of these changes is to modify the generic sequence classification model to handle 'unpacking' multiple sequences in the output stage, treating them as a larger batch size for classification, as well as masking any padding created by packing.\n",
+    "\n",
+    "First let's load a default BERT configuration using `AutoConfig`. The config includes a new parameter we must set, `max_sequences_per_pack`, this informs the model of the maximum number of sequences it will need to 'unpack' in the model output. It also allows us to clearly define the `num_labels` and `problem_type` for this model.\n",
+    "\n",
+    "The problem type is essential to define here, as switching between methods used by different types of classification requires it within the custom model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoConfig\n",
+    "\n",
+    "config = AutoConfig.from_pretrained(model_checkpoint)\n",
+    "config.max_sequences_per_pack = max_seq_per_pack\n",
+    "config.num_labels = num_labels\n",
+    "config.problem_type = problem_type"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can instantiate the model class with the config, loading the weights from the model checkpoint."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import numpy as np\n",
+    "torch.manual_seed(43)\n",
+    "np.random.seed(43)\n",
+    "\n",
+    "from models.modeling_bert_packed import PipelinedPackedBertForSequenceClassification\n",
+    "\n",
+    "model = PipelinedPackedBertForSequenceClassification.from_pretrained(\n",
+    "    model_checkpoint, config=config).train()\n",
+    "\n",
+    "print(config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "CczA5lJlIrJX"
+   },
+   "source": [
+    "The warning is telling us we are throwing away some weights and randomly initializing others. This is absolutely normal in this case, because we are removing the head used to pretrain the model on a masked language modeling objective and replacing it with a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can first test the model on CPU and observe that the output logits now have the size [batch_size x max_seq_per_pack, 2] = [12, 2] with this notebook's default values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers.data.data_collator import default_data_collator\n",
+    "import torch\n",
+    "\n",
+    "model.float()\n",
+    "loader = torch.utils.data.DataLoader(packed_train_dataset,\n",
+    "                             batch_size=micro_batch_size,\n",
+    "                             shuffle=True,\n",
+    "                             drop_last=True,\n",
+    "                             collate_fn=default_data_collator)\n",
+    "data = next(iter(loader))\n",
+    "outputs = model(**data)\n",
+    "print(outputs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's prepare the model for IPU"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, we set the model in half precision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model.half()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For validation, we need to define a function to compute the metrics from the predictions, which will just use the `metric` we loaded earlier. The only preprocessing we have to do is to take the argmax of our predicted logits (or just squeeze the last axis in the case of STS-B). To ignore the `-100` labels from incomplete packs, we use a boolean mask."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "metric_name = \"pearson\" if task == \"stsb\" else \"matthews_correlation\" if task == \"cola\" else \"accuracy\"\n",
+    "model_name = model_checkpoint.split(\"/\")[-1]\n",
+    "\n",
+    "def compute_metrics(eval_pred):\n",
+    "    predictions, labels = eval_pred\n",
+    "    \n",
+    "#   Remove the padding labels\n",
+    "    mask = (labels != -100)\n",
+    "    labels = labels[mask]\n",
+    "    \n",
+    "    if task != \"stsb\":\n",
+    "        predictions = np.argmax(predictions, axis=-1)\n",
+    "    else:\n",
+    "        predictions = predictions[:, 0]\n",
+    "    \n",
+    "    predictions = predictions[mask]\n",
+    "    \n",
+    "    return metric.compute(predictions=predictions, references=labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "_N8urzhyIrJY"
+   },
+   "source": [
+    "Next, we need to define the `IPUConfig`, which is a class that specifies attributes and configuration parameters to compile and put the model on the device. We initialize it with one config name or path, which we set earlier. Then we use it to set the model attribute `model.ipu_config`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from optimum.graphcore import IPUConfig, IPUTrainer, IPUTrainingArguments\n",
+    "\n",
+    "ipu_config = IPUConfig.from_pretrained(\n",
+    "    ipu_config_name,\n",
+    "    executable_cache_dir = executable_cache_dir,\n",
+    "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
+    "    replication_factor=1,\n",
+    "    device_iterations = device_iterations,\n",
+    "    inference_device_iterations= 16,\n",
+    "    inference_replication_factor=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `IPUTrainingArguments` define any custom parameter modification we want to do, such as the initial learning rate for the model. It also allows other options, such as dataloader parameters, micro batch sizes and an automatic push to the Hugging Face Hub (if credentials were set up earlier) to happen at given intervals.\n",
+    "\n",
+    "These arguments are passed to the `IPUTrainer` which wraps the model training and evaluation process into a simple single-line process, doing all of the heavy lifting for us regarding training and evaluation loops, device assignment, optimiser definition, dataloading etc.\n",
+    "\n",
+    "Note that only some arbitrary hyperparameter tuning was performed for this task. Other tasks and datasets may require further tuning to get the most optimal results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import default_data_collator\n",
+    "\n",
+    "args = IPUTrainingArguments(\n",
+    "    \"./\"+f\"{model_name}-{task}\",\n",
+    "    num_train_epochs=2,\n",
+    "    per_device_train_batch_size=micro_batch_size,\n",
+    "    per_device_eval_batch_size=2,\n",
+    "    learning_rate=9e-5,\n",
+    "    warmup_ratio=0.1,\n",
+    "    weight_decay=0,\n",
+    "    lr_scheduler_type = \"cosine\",\n",
+    "    metric_for_best_model=metric_name,\n",
+    "    dataloader_drop_last=True,\n",
+    "    # dataloader_mode=\"async_rebatched\",\n",
+    "    logging_steps=1,\n",
+    "    pod_type=pod_type,\n",
+    "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
+    "    push_to_hub=False  # set to True only after logging in to the Hugging Face Hub; otherwise trainer creation needs credentials\n",
+    ")\n",
+    "\n",
+    "\n",
+    "trainer = IPUTrainer(\n",
+    "    model,\n",
+    "    ipu_config,\n",
+    "    args,\n",
+    "    train_dataset=packed_train_dataset,\n",
+    "    eval_dataset=packed_val_dataset,\n",
+    "    data_collator=default_data_collator,\n",
+    "    compute_metrics=compute_metrics\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then, to train the model we can simply call the `train()` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***About the performance:*** `IPUTrainer` doesn't take into account that we have packed data samples when computing the speed metrics. It treats a 'sample' as a single input to the model, i.e. one **pack**.\n",
+    "\n",
+    "So the actual throughput estimation can be obtained by multiplying the `samples_per_second` by the average packing factor (the average number of samples per pack) of the dataset. These were obtained in the `packing_algorithm` section: `5.15` for `sst2` training set and `5.77` for validation set."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we can evaluate the model by simply calling the `evaluate()` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainer.evaluate()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To see how your model fared you can compare it to the [GLUE Benchmark leaderboard](https://gluebenchmark.com/leaderboard).\n",
+    "\n",
+    "You can now upload the result of the training to the Hub if you successfully logged in at the beginning of this notebook, just execute this instruction:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# trainer.push_to_hub()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can also save the model locally:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "# Join path components with '/': pathlib.Path does not support '+' with a str (TypeError).\n",
+    "# The result is converted to str so it can be reused below as a plain model path.\n",
+    "saved_model_checkpoint = str(Path(os.getenv('CHECKPOINT_DIR', '/tmp/')) / f\"{model_name}-{task}\")\n",
+    "trainer.save_model(saved_model_checkpoint)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You have now successfully fine-tuned and evaluated your speed-optimised model for text classification using packing!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fast batched inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Packing can also be used for inference, particularly for performing inference on large workloads. This section demonstrates how to perform faster, batched inference with a large number of samples using a super-easy custom pipeline which batches and packs your input data, performs inference and returns postprocessed predictions.\n",
+    "\n",
+    "For the pipeline, we need to import it, and initialise a few essential parameters.\n",
+    "\n",
+    "The `model` is the model checkpoint, we are going to use the locally saved checkpoint generated from training `sst2`. The `executable_cache_dir`, `problem_type`, `max_seq_length` must be specified. To return predictions organised by class names, the class names for your output must be passed to `label_categories`. If you are loading a saved model without a pre-trained tokenizer saved in the checkpoint folder, it will be loaded automatically from `bert-base-uncased`, if you wish to load a different pre-trained tokenizer, you can specify this by passing the `pretrained_tokenizer` argument with the name of your tokenizer to the `PackedBertTextClassificationPipeline`.\n",
+    "\n",
+    "The pipeline will automatically determine your model's IPU config, given that the checkpoint was trained using Optimum Graphcore, which will be the case for the model fine-tuned in this notebook.\n",
+    "\n",
+    "In this example, we pre-load the IPUConfig and modify some of the default parameters to get the best performance out of inference and leverage the benefits of IPU parallelism. The micro-batch size can also be specified, for which the default is 1.\n",
+    "\n",
+    "When training, the packing factor affects the convergence the same way as a large increase in batch size would do. However, for inference, we are free to use a bigger packing factor to speed it up. Let's try it with `max_seq_per_pack = 12`.\n",
+    "\n",
+    "**Note:** Packing brings huge benefits for performing inference on large amounts of data. For small scale inference tasks, such as those better suited to sequential inference on a single un-batched input, the generic Optimum Graphcore `TextClassificationPipeline` may be preferred. This won't affect fine-tuning: the weights generated from fine-tuning using packing will work just the same!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's initialise the `PackedBertTextClassificationPipeline`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from pipeline.packed_bert import PackedBertTextClassificationPipeline\n",
+    "from optimum.graphcore import IPUConfig\n",
+    "\n",
+    "model = saved_model_checkpoint  \n",
+    "# model = 'your_username/{model_name}-{task}' # uncomment this and use your username to load from Hugging Face Hub\n",
+    "\n",
+    "inference_boosted_ipu_config = IPUConfig.from_pretrained(model, \n",
+    "        inference_device_iterations=32,\n",
+    "        inference_replication_factor=4,\n",
+    "        ipus_per_replica=1,\n",
+    "        layers_per_ipu=[12]\n",
+    "    )\n",
+    "\n",
+    "# `label_categories` must be listed in class-id order: the pipeline pairs the i-th name with the\n",
+    "# score of class i. For sst2, id 0 is 'negative' and id 1 is 'positive'.\n",
+    "pipeline = PackedBertTextClassificationPipeline(\n",
+    "    model = model,\n",
+    "    executable_cache_dir = executable_cache_dir,\n",
+    "    problem_type='single_label_classification',\n",
+    "    max_seq_per_pack=12,\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    ipu_config=inference_boosted_ipu_config,\n",
+    "    micro_batch_size=8,\n",
+    "    label_categories=['negative','positive']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The pipeline expects a **list of strings** directly passed to it. There is no need to tokenize, preprocess, pack or postprocess the data to use the inference pipeline.\n",
+    "\n",
+    "As a test, we can load the entire `sst2` dataset and perform packed inference using `.predict()` on the text column to generate predictions. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "dataset = datasets.load_dataset('sst2')\n",
+    "# The pipeline takes the raw list of sentences; tokenizing and packing happen inside `predict`.\n",
+    "preds = pipeline.predict(dataset['train']['sentence'])\n",
+    "\n",
+    "print(preds.keys())\n",
+    "print(f\"Number of predictions: {len(preds['predictions'])}\")\n",
+    "print(f\"Preprocessing time: {preds['preprocessing_time']}s\")\n",
+    "print(f\"Postprocessing time: {preds['postprocessing_time']}s\")\n",
+    "print(f\"Throughput: {preds['throughput']} samples/s\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There is minimal overhead from tokenizing and packing the dataset, but the speed benefits are evident. Running the above pipeline, we achieve a throughput of approximately 35000 samples per second, demonstrating the huge time benefit you can achieve by using packing!"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "name": "Text Classification on GLUE",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/packed-bert/pipeline/__init__.py b/packed-bert/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packed-bert/pipeline/packed_bert.py b/packed-bert/pipeline/packed_bert.py
new file mode 100644
index 0000000..cdb2679
--- /dev/null
+++ b/packed-bert/pipeline/packed_bert.py
@@ -0,0 +1,440 @@
+import logging
+import time
+from typing import Dict, List
+
+import numpy as np
+import torch
+from datasets import Dataset
+
+import poptorch
+from models.modeling_bert_packed import (
+    PipelinedPackedBertForQuestionAnswering,
+    PipelinedPackedBertForSequenceClassification,
+)
+from optimum.graphcore import IPUConfig
+from scipy.special import softmax
+from transformers import AutoConfig, AutoTokenizer
+from transformers.data.data_collator import default_data_collator
+from utils.packing.dataset_creator import PackedDatasetCreator
+from utils.packing.dataset_templates import PackedQuestionAnsweringDataset
+from utils.packing.qa_utils import postprocess_packed_qa_predictions, preprocess_packed_qa
+
+
+logger = logging.getLogger("")
+
+
+def get_poplar_executor(model, ipu_config, batch, detach=False):
+    """Wrap `model` for IPU inference with PopTorch and compile it on `batch`.
+
+    Args:
+        model: Model to wrap. If it is already a `poptorch.PoplarExecutor` it is returned as is.
+        ipu_config: Config object providing `to_options(for_inference=True)`; it is also stored
+            on the model as `model.ipu_config`.
+        batch: Mapping of keyword inputs, expanded into `compile(**batch)`.
+        detach: When true, `detachFromDevice()` is called on the compiled executor before it is
+            returned.
+
+    Returns:
+        The object returned by `poptorch.inferenceModel` (or `model` itself if already wrapped).
+    """
+    ipu_options = ipu_config.to_options(for_inference=True)
+    model.ipu_config = ipu_config
+
+    if isinstance(model, poptorch.PoplarExecutor):
+        print("Model already wrapped - nothing to do.")
+        return model
+    try:
+        model.deparallelize()
+    except Exception:
+        # Deliberately best effort (presumably the call fails when the model was never
+        # parallelized - TODO confirm). `Exception` instead of a bare `except` so that
+        # KeyboardInterrupt and SystemExit are no longer swallowed.
+        pass
+
+    ipu_model = poptorch.inferenceModel(model.eval().parallelize(), ipu_options)
+
+    ipu_model.compile(**batch)
+
+    if detach:
+        ipu_model.detachFromDevice()
+
+    return ipu_model
+
+
+def prepare_inference_dataloader(ipu_config, dataset, batch_size, mode="async_rebatched"):
+    """Build a `poptorch.DataLoader` over `dataset` using the inference options of `ipu_config`."""
+    inference_options = ipu_config.to_options(for_inference=True)
+    loader_kwargs = {
+        "batch_size": batch_size,
+        # Must be false, retained order important for batched inference
+        "shuffle": False,
+        # Must be false, we pad up to global batch size in inference pipeline to avoid any division error
+        "drop_last": False,
+        "mode": mode,
+        "collate_fn": default_data_collator,
+    }
+    return poptorch.DataLoader(inference_options, dataset, **loader_kwargs)
+
+
+class PackedBertTextClassificationPipeline:
+    """
+    Packed classification pipeline:
+
+    Batched inference pipeline for packed BERT text classification with multi/single label. Wraps all preprocessing and model for inference, executes on text inputs (a list of sentences, optionally with a second list of paired sentences) of any size, proceeds to batch according to checkpoint or as per custom IPU configs, and packs data. Performs inference on PipelinedPackedBertForSequenceClassification. Returns postprocessed predictions in same order as input data.
+    """
+
+    def __init__(
+        self,
+        model,
+        executable_cache_dir: str = "./exe_cache",
+        problem_type: str = "single_label_classification",
+        max_seq_per_pack: int = 12,
+        max_seq_length: int = 384,
+        ipu_config: IPUConfig = None,
+        micro_batch_size: int = 1,
+        dataloader_mode: str = "async_rebatched",
+        detach_model_after_compile: bool = False,
+        pretrained_tokenizer: str = "bert-base-uncased",
+        label_categories: List = None,
+    ) -> None:
+        """Load config, tokenizer and model, then compile the model on one dummy batch.
+
+        Args:
+            model: Checkpoint name or path handed to the `from_pretrained` calls for the IPU
+                config (when `ipu_config` is not given), the tokenizer, the model config and the
+                model weights.
+            executable_cache_dir: Executable cache directory set on the IPU config.
+            problem_type: Written to the model config and passed to `PackedDatasetCreator`.
+                `postprocess_preds` supports `"single_label_classification"` and
+                `"multi_label_classification"`.
+            max_seq_per_pack: Maximum number of sequences per pack.
+            max_seq_length: Maximum tokenized length; longer inputs are truncated.
+            ipu_config: Optional IPU config. When omitted it is loaded from `model`, falling
+                back to `"Graphcore/bert-base-uncased"`.
+            micro_batch_size: Batch size given to the dataloader. The global batch size is
+                `inference_device_iterations * inference_replication_factor * micro_batch_size`.
+            dataloader_mode: Passed as `mode` to `prepare_inference_dataloader`.
+            detach_model_after_compile: Forwarded as `detach` to `get_poplar_executor`.
+            pretrained_tokenizer: Tokenizer used when none can be loaded from `model`.
+            label_categories: Class names in class-index order. When their number matches the
+                number of model outputs, predictions are keyed by these names.
+        """
+        self.model_ckpt = model
+        self.problem_type = problem_type
+        self.max_seq_per_pack = max_seq_per_pack
+        self.max_seq_length = max_seq_length
+
+        self.pretrained_tokenizer = pretrained_tokenizer
+        self.dataloader_mode = dataloader_mode
+        self.detach_model_after_post_compile = detach_model_after_compile
+        self.executable_cache_dir = executable_cache_dir
+
+        self.micro_batch_size = micro_batch_size
+        self.sentence_2_key = None
+        # Copy into a fresh list so no mutable default is shared between instances.
+        self.label_categories = list(label_categories) if label_categories is not None else []
+
+        if not ipu_config:
+            try:
+                logger.info("Attempting loading IPUConfig from model checkpoint:")
+                self.ipu_config = IPUConfig.from_pretrained(
+                    self.model_ckpt, executable_cache_dir=self.executable_cache_dir
+                )
+            except Exception:
+                logger.warning(
+                    "Loading default config: 'Graphcore/bert-base-uncased' - because no IPUConfig found in model folder."
+                )
+                self.ipu_config = IPUConfig.from_pretrained(
+                    "Graphcore/bert-base-uncased", executable_cache_dir=self.executable_cache_dir
+                )
+        else:
+            self.ipu_config = ipu_config
+            if self.executable_cache_dir is not None:
+                self.ipu_config.executable_cache_dir = self.executable_cache_dir
+
+        # Global batch size: packed datasets are padded up to a multiple of this value.
+        self.gbs = (
+            self.ipu_config.inference_device_iterations
+            * self.ipu_config.inference_replication_factor
+            * self.micro_batch_size
+        )
+
+        try:
+            logger.info("Attempting loading tokenizer from model checkpoint")
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt, use_fast=True)
+        except Exception:
+            logger.warning("Loading tokenizer from defined because no pretrained tokenizer found in model folder.")
+            self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_tokenizer, use_fast=True)
+
+        config = AutoConfig.from_pretrained(self.model_ckpt)
+        config.max_sequences_per_pack = self.max_seq_per_pack
+        config.problem_type = self.problem_type
+
+        self.model = (
+            PipelinedPackedBertForSequenceClassification(config).from_pretrained(self.model_ckpt, config=config).half()
+        )
+
+        # A single dummy sentence is tokenized and packed only to obtain a batch for compilation.
+        compile_data = Dataset.from_dict({"text": ["I am a dummy sentence for compilation."]})
+
+        enc_compile_data = compile_data.map(self.preprocess_function, batched=True)
+
+        pck_compile_data = PackedDatasetCreator(
+            tokenized_dataset=enc_compile_data,
+            max_sequence_length=self.max_seq_length,
+            max_sequences_per_pack=self.max_seq_per_pack,
+            inference=True,
+            pad_to_global_batch_size=True,
+            global_batch_size=self.gbs,
+            problem_type=self.problem_type,
+        ).create()
+
+        c_dataloader = prepare_inference_dataloader(
+            self.ipu_config, pck_compile_data, self.micro_batch_size, self.dataloader_mode
+        )
+
+        c_batch = next(iter(c_dataloader))
+
+        # Remove custom column for compile - autoignored in optimum, manually ignored in predict
+        c_batch.pop("example_ids", None)
+
+        # Previously `detach_model_after_compile` was stored but never used; forward it here.
+        self.poplar_executor = get_poplar_executor(
+            self.model, self.ipu_config, c_batch, detach=self.detach_model_after_post_compile
+        )
+
+    def preprocess_function(self, examples):
+        """Tokenize `examples["text"]` (paired with `examples["text_2"]` when a second input was given)."""
+        if self.sentence_2_key:
+            return self.tokenizer(
+                examples["text"], examples["text_2"], truncation=True, max_length=self.max_seq_length
+            )
+        else:
+            return self.tokenizer(examples["text"], truncation=True, max_length=self.max_seq_length)
+
+    def postprocess_preds(self, logits, ids):
+        """Turn per-batch logits into per-example class scores ordered by ascending example id.
+
+        Args:
+            logits: List of logit tensors, one per batch.
+            ids: List of example-id tensors, one per batch; entries equal to -100 are dropped
+                together with their logits.
+
+        Returns:
+            NumPy float32 array of scores with one row per retained example.
+
+        Raises:
+            ValueError: If `self.problem_type` is not a supported classification type.
+        """
+        ids = torch.concat(ids)
+        mask = ids != -100
+        ids = ids[mask]
+
+        scores = torch.concat(logits)[mask, :].float()
+
+        if self.problem_type == "multi_label_classification":
+            # Labels are independent in the multi-label case, so each logit gets its own
+            # sigmoid; a softmax would force the scores of one example to sum to 1.
+            pred_scores = torch.sigmoid(scores).numpy()
+        elif self.problem_type == "single_label_classification":
+            pred_scores = softmax(scores.numpy(), axis=1)
+        else:
+            raise ValueError(
+                f"Unsupported problem_type for classification postprocessing: {self.problem_type!r}"
+            )
+
+        pred_scores = pred_scores[np.argsort(ids)]
+
+        return pred_scores
+
+    def predict(self, sentence_1, sentence_2=None):
+        """Run packed inference on a list of sentences (optionally sentence pairs).
+
+        Args:
+            sentence_1: List of input strings.
+            sentence_2: Optional list of second strings, paired element-wise with `sentence_1`.
+
+        Returns:
+            Dict with keys `predictions`, `throughput`, `inference_total_time`,
+            `preprocessing_time` and `postprocessing_time`. `predictions` maps each row index of
+            the postprocessed scores to a dict of scores keyed by class name when
+            `label_categories` matches the number of model outputs, otherwise by class index.
+        """
+        self.sentence_2_key = sentence_2
+
+        prep_st = time.time()
+
+        data_dict = {"text": sentence_1}
+        if sentence_2:
+            data_dict["text_2"] = sentence_2
+
+        dataset = Dataset.from_dict(data_dict)
+        enc_data = dataset.map(self.preprocess_function, batched=True)
+
+        # Pack the inputs
+        packed_data = PackedDatasetCreator(
+            tokenized_dataset=enc_data,
+            max_sequence_length=self.max_seq_length,
+            max_sequences_per_pack=self.max_seq_per_pack,
+            inference=True,
+            pad_to_global_batch_size=True,
+            global_batch_size=self.gbs,
+            problem_type=self.problem_type,
+        ).create()
+
+        dataloader = prepare_inference_dataloader(
+            self.ipu_config, packed_data, self.micro_batch_size, self.dataloader_mode
+        )
+
+        example_ids = []
+        outputs = []
+
+        # Process the model to return logits
+        prep_time = time.time() - prep_st
+
+        model_st = time.time()
+        for batch in iter(dataloader):
+            logits = self.poplar_executor(
+                input_ids=batch["input_ids"],
+                attention_mask=batch["attention_mask"],
+                token_type_ids=batch["token_type_ids"],
+                position_ids=batch["position_ids"],
+            )
+
+            ids = batch["example_ids"]
+            outputs.append(logits.view(ids.shape[0], self.max_seq_per_pack, -1))
+            example_ids.append(ids)
+
+        model_en = time.time()
+        model_time = model_en - model_st
+        tput = len(sentence_1) / (model_time)
+
+        # Postprocess predictions to preserve order
+        post_st = time.time()
+        final_preds = self.postprocess_preds(outputs, example_ids)
+
+        if len(self.label_categories) == final_preds.shape[-1]:
+            final_preds = {k: dict(zip(self.label_categories, v)) for k, v in enumerate(final_preds)}
+        else:
+            # No usable class names: key the scores by class index instead. The previous
+            # expression built a set of dicts, which raises TypeError (dicts are unhashable).
+            final_preds = {k: dict(enumerate(v)) for k, v in enumerate(final_preds)}
+
+        post_proc_time = time.time() - post_st
+
+        return {
+            "predictions": final_preds,
+            "throughput": tput,
+            "inference_total_time": model_time,
+            "preprocessing_time": prep_time,
+            "postprocessing_time": post_proc_time,
+        }
+
+
+class PackedBertQuestionAnsweringPipeline:
+    """
+    Packed Question-answering pipeline:
+
+    Batched inference pipeline for packed BERT question answering. Wraps all preprocessing and model for inference, executes on text inputs in format `questions, contexts` of any size, proceeds to batch according to checkpoint or as per custom IPU configs, and packs data. Performs inference on PipelinedPackedBertForQuestionAnswering. Returns postprocessed predictions in same order as input data.
+    """
+
+    def __init__(
+        self,
+        model,
+        executable_cache_dir: str = "./exe_cache",
+        problem_type: str = "question_answering",
+        max_seq_per_pack: int = 12,
+        max_seq_length: int = 384,
+        pretrained_tokenizer: str = "bert-base-uncased",
+        ipu_config: IPUConfig = None,
+        micro_batch_size: int = 1,
+        dataloader_mode: str = "async_rebatched",
+        detach_model_after_compile: bool = False,
+    ) -> None:
+        """Load config, tokenizer and model, then compile the model on a dummy batch.
+
+        Args:
+            model: Checkpoint name or path handed to the `from_pretrained` calls for the IPU
+                config (when `ipu_config` is not given), the tokenizer, the model config and the
+                model weights.
+            executable_cache_dir: Executable cache directory set on the IPU config.
+            problem_type: Written to the model config and passed to `PackedDatasetCreator`.
+            max_seq_per_pack: Maximum number of sequences per pack.
+            max_seq_length: Sequence length passed to `preprocess_packed_qa` and the packer.
+            pretrained_tokenizer: Tokenizer used when none can be loaded from `model`.
+            ipu_config: Optional IPU config object (it was annotated as `str`, but its
+                attributes are read below). When omitted it is loaded from `model`, falling
+                back to `"Graphcore/bert-base-uncased"`.
+            micro_batch_size: Batch size given to the dataloader. The global batch size is
+                `inference_device_iterations * inference_replication_factor * micro_batch_size`.
+            dataloader_mode: Passed as `mode` to `prepare_inference_dataloader`.
+            detach_model_after_compile: Forwarded as `detach` to `get_poplar_executor`.
+        """
+        self.problem_type = problem_type
+        self.max_seq_per_pack = max_seq_per_pack
+        self.max_seq_length = max_seq_length
+
+        self.model_ckpt = model
+        self.pretrained_tokenizer = pretrained_tokenizer
+        self.dataloader_mode = dataloader_mode
+        self.detach_model_after_post_compile = detach_model_after_compile
+        self.executable_cache_dir = executable_cache_dir
+        self.micro_batch_size = micro_batch_size
+
+        if not ipu_config:
+            try:
+                logger.info("Attempting loading IPUConfig from model checkpoint:")
+                self.ipu_config = IPUConfig.from_pretrained(
+                    self.model_ckpt, executable_cache_dir=self.executable_cache_dir
+                )
+            except Exception:
+                logger.warning(
+                    "Loading default config: 'Graphcore/bert-base-uncased' - because no IPUConfig found in model folder."
+                )
+                self.ipu_config = IPUConfig.from_pretrained(
+                    "Graphcore/bert-base-uncased", executable_cache_dir=self.executable_cache_dir
+                )
+        else:
+            self.ipu_config = ipu_config
+            if self.executable_cache_dir is not None:
+                self.ipu_config.executable_cache_dir = self.executable_cache_dir
+
+        # Global batch size: packed datasets are padded up to a multiple of this value.
+        self.gbs = (
+            self.ipu_config.inference_device_iterations
+            * self.ipu_config.inference_replication_factor
+            * self.micro_batch_size
+        )
+
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt, use_fast=True)
+        except Exception:
+            # Fall back to the named tokenizer when the checkpoint does not provide one.
+            self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_tokenizer, use_fast=True)
+
+        config = AutoConfig.from_pretrained(self.model_ckpt)
+        config.max_sequences_per_pack = self.max_seq_per_pack
+        config.problem_type = self.problem_type
+
+        self.model = (
+            PipelinedPackedBertForQuestionAnswering(config).from_pretrained(self.model_ckpt, config=config).half()
+        )
+
+        # One dummy question/context pair, repeated `gbs` times, is used only for compilation.
+        compile_data = Dataset.from_dict(
+            {
+                "id": np.array([str(i) for i in range(self.gbs)]).astype("<U32"),
+                "question": ["Do trees have leaves in the wintertime?."] * self.gbs,
+                "context": [
+                    "Most trees leaves fall off after the autumn season. However, evergreen trees keep their leaves through winter."
+                ]
+                * self.gbs,
+            }
+        )
+
+        enc_compile_data = preprocess_packed_qa(
+            dataset=compile_data,
+            tokenizer=self.tokenizer,
+            question_key="question",
+            context_key="context",
+            answer_key="answer",
+            sequence_length=self.max_seq_length,
+            padding=True,  # only for compile, so we dont need to pack the dummy data
+            train=False,
+        )
+
+        packed_compile_data_pre = PackedDatasetCreator(
+            tokenized_dataset=enc_compile_data,
+            max_sequence_length=self.max_seq_length,
+            max_sequences_per_pack=self.max_seq_per_pack,
+            inference=True,
+            pad_to_global_batch_size=True,
+            global_batch_size=self.gbs,
+            problem_type=self.problem_type,
+        ).create()
+
+        packed_compile_data = Dataset.from_list(packed_compile_data_pre)
+        packed_compile_data = packed_compile_data.remove_columns(["offset_mapping", "example_ids"])
+
+        c_dataloader = prepare_inference_dataloader(
+            self.ipu_config, packed_compile_data, self.micro_batch_size, self.dataloader_mode
+        )
+
+        c_batch = next(iter(c_dataloader))
+        # Both columns were removed above; the pops are a safeguard so that only model inputs
+        # reach `compile`. The key is `example_ids` (it was misspelled `example_id`).
+        c_batch.pop("offset_mapping", None)
+        c_batch.pop("example_ids", None)
+
+        # Previously `detach_model_after_compile` was stored but never used; forward it here.
+        self.poplar_executor = get_poplar_executor(
+            self.model, self.ipu_config, c_batch, detach=self.detach_model_after_post_compile
+        )
+
+    def predict(self, questions, contexts):
+        """Run packed question-answering inference.
+
+        Args:
+            questions: List of question strings.
+            contexts: List of context strings, paired element-wise with `questions`.
+
+        Returns:
+            Dict with keys `predictions`, `throughput`, `inference_total_time`,
+            `preprocessing_time` and `postprocessing_time`. `predictions` is a list of
+            `{"id": ..., "prediction_text": ...}` dicts built from the output of
+            `postprocess_packed_qa_predictions`.
+        """
+        prep_st = time.time()
+
+        # Ids are the stringified positions of the inputs.
+        dataset = Dataset.from_dict(
+            {
+                "id": np.array([str(i) for i in range(len(questions))]).astype("<U32"),
+                "question": questions,
+                "context": contexts,
+            }
+        )
+
+        enc_data = preprocess_packed_qa(
+            dataset=dataset,
+            tokenizer=self.tokenizer,
+            question_key="question",
+            context_key="context",
+            answer_key="answer",
+            sequence_length=self.max_seq_length,
+            padding=False,
+            train=False,
+        )
+
+        packed_data_pre = PackedDatasetCreator(
+            tokenized_dataset=enc_data,
+            max_sequence_length=self.max_seq_length,
+            max_sequences_per_pack=self.max_seq_per_pack,
+            inference=True,
+            pad_to_global_batch_size=True,
+            global_batch_size=self.gbs,
+            problem_type=self.problem_type,
+        ).create()
+
+        # Not the most efficient way...
+        packed_data = Dataset.from_list(packed_data_pre)
+        packed_data = packed_data.remove_columns(["offset_mapping", "example_ids"])
+        packed_data = PackedQuestionAnsweringDataset(
+            input_ids=packed_data["input_ids"],
+            attention_mask=packed_data["attention_mask"],
+            token_type_ids=packed_data["token_type_ids"],
+            position_ids=packed_data["position_ids"],
+            start_positions=None,
+            end_positions=None,
+            offset_mapping=None,
+            example_ids=None,
+        )
+
+        dataloader = prepare_inference_dataloader(
+            self.ipu_config, packed_data, self.micro_batch_size, self.dataloader_mode
+        )
+
+        outputs = []
+        prep_time = time.time() - prep_st
+
+        model_st = time.time()
+        for batch in iter(dataloader):
+            logits = self.poplar_executor(**batch)
+            outputs.append(torch.stack(logits))
+
+        model_en = time.time()
+        model_time = model_en - model_st
+        tput = len(questions) / (model_time)
+
+        post_st = time.time()
+        # Each entry was stacked along a new leading axis, so batches are joined along axis 1.
+        outputs = torch.cat(outputs, dim=1).numpy()
+        final_preds = postprocess_packed_qa_predictions(dataset, packed_data_pre, outputs)
+
+        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_preds.items()]
+
+        post_proc_time = time.time() - post_st
+
+        return {
+            "predictions": formatted_predictions,
+            "throughput": tput,
+            "inference_total_time": model_time,
+            "preprocessing_time": prep_time,
+            "postprocessing_time": post_proc_time,
+        }
diff --git a/packed-bert/utils/__init__.py b/packed-bert/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packed-bert/utils/packing/__init__.py b/packed-bert/utils/packing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packed-bert/utils/packing/algorithms.py b/packed-bert/utils/packing/algorithms.py
new file mode 100644
index 0000000..a9afe33
--- /dev/null
+++ b/packed-bert/utils/packing/algorithms.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from collections import defaultdict
+
+import numpy as np
+
+from scipy import optimize, stats
+
+
+def add_pack(pack, count, tmp, final, limit, offset, max_sequence_length=None):
+    """Register `count` copies of `pack` in the finished or the still-open strategy table.
+
+    Args:
+        pack: List of sequence lengths that make up the pack.
+        count: Number of times this pack occurs.
+        tmp: Mapping `offset -> [(count, pack), ...]` of packs that may still receive sequences.
+        final: Mapping `offset -> [(count, pack), ...]` of packs that are finished.
+        limit: Maximum number of sequences allowed in one pack.
+        offset: Number of token positions still free in the pack.
+        max_sequence_length: Unused. Accepted because `LPFHP` passes it as a seventh positional
+            argument, which previously raised a TypeError.
+    """
+    # A pack is finished once it holds `limit` sequences or has no free tokens left.
+    if len(pack) == limit or offset == 0:
+        final[offset].append((count, pack))
+    else:
+        tmp[offset].append((count, pack))
+
+
+def LPFHP(histogram, max_sequence_length, max_sequences_per_pack, distribute=True):
+    """Longest-pack-first histogram-packing.
+
+    Sequence lengths are processed from longest to shortest. Each length is placed into the
+    open pack that leaves the fewest free tokens (the `offset` search starts at 0 and grows);
+    when no open pack has room, new packs are started. A summary of the result is printed.
+
+    Args:
+        histogram: Counts per sequence length; entry `k` is read as the number of sequences
+            with `k + 1` tokens.
+        max_sequence_length: Token capacity of one pack.
+        max_sequences_per_pack: Maximum number of sequences per pack, or the string `"max"` to
+            use `max_sequence_length` as the limit.
+        distribute: When true, several sequences of the same length may be added to a pack in
+            one step; when false exactly one is added per step.
+
+    Returns:
+        Tuple `(strategy_set, strategy_repeat_count)`: a list of packs (each a list of sequence
+        lengths) and a NumPy array holding how many times each pack is used.
+    """
+    start = time.time()
+    reversed_histogram = np.flip(histogram)
+    # Initialize main strategy data dictionary.
+    # The key indicates how many tokens are left for full length.
+    # The value is a list of tuples, consisting of counts and respective packs.
+    # A pack is a (sorted) list of sequence length values that get concatenated.
+    tmp_strategies_per_length = defaultdict(list)
+    strategies_per_length = defaultdict(list)
+    if max_sequences_per_pack == "max":
+        max_sequences_per_pack = max_sequence_length
+    # Index i indicates here, how much space is left, due to reversed histogram
+    for i in range(max_sequence_length):
+        n_sequences_to_bin = reversed_histogram[i]
+        length_to_bin = max_sequence_length - i
+        offset = 0  # smallest possible offset for perfect fit
+        while n_sequences_to_bin > 0:
+            if (length_to_bin + offset) in tmp_strategies_per_length:
+                # extract worst pack that will get modified
+                n_sequences_to_pack, pack = tmp_strategies_per_length[length_to_bin + offset].pop()
+                # calculate how often the current sequence maximally fits in
+                repeat = min(1 + offset // length_to_bin, max_sequences_per_pack - len(pack))
+                # correct dependent on count
+                while n_sequences_to_bin // repeat == 0:
+                    repeat -= 1
+                if not distribute:
+                    repeat = 1
+                new_pack = pack + [length_to_bin] * repeat
+                count = min(n_sequences_to_pack, n_sequences_to_bin // repeat)
+                if n_sequences_to_pack > count:
+                    # old pack gets reduced
+                    n_sequences_to_pack -= count
+                    tmp_strategies_per_length[length_to_bin + offset].append((n_sequences_to_pack, pack))
+                    n_sequences_to_bin -= count * repeat
+                else:
+                    n_sequences_to_bin -= n_sequences_to_pack * repeat
+                # `add_pack` is declared with six parameters; the extra `max_sequence_length`
+                # argument that used to be passed here raised a TypeError.
+                add_pack(
+                    new_pack,
+                    count,
+                    tmp_strategies_per_length,
+                    strategies_per_length,
+                    max_sequences_per_pack,
+                    offset - (repeat - 1) * length_to_bin,
+                )
+                # clean up to speed up main key search
+                if not tmp_strategies_per_length[length_to_bin + offset]:
+                    tmp_strategies_per_length.pop(length_to_bin + offset)
+                # reset offset in case best fit changed
+                offset = 0
+            else:
+                offset += 1
+            # Does not fit anywhere. Create new pack.
+            if offset >= max_sequence_length - length_to_bin + 1:
+                # similar repetition but no dependence on pack.
+                repeat = min(max_sequence_length // length_to_bin, max_sequences_per_pack)
+                while n_sequences_to_bin // repeat == 0:
+                    repeat -= 1
+                if not distribute:
+                    repeat = 1
+                add_pack(
+                    [length_to_bin] * repeat,
+                    n_sequences_to_bin // repeat,
+                    tmp_strategies_per_length,
+                    strategies_per_length,
+                    max_sequences_per_pack,
+                    max_sequence_length - length_to_bin * repeat,
+                )
+                n_sequences_to_bin -= n_sequences_to_bin // repeat * repeat
+    # merge all strategies
+    for key in tmp_strategies_per_length:
+        strategies_per_length[key].extend(tmp_strategies_per_length[key])
+    # flatten strategies dictionary
+    strategy_set = []
+    strategy_repeat_count = []
+    for key in strategies_per_length:
+        for count, pack in strategies_per_length[key]:
+            # packs were built longest-first; reverse in place so lengths end up ascending
+            pack.reverse()
+            strategy_set.append(pack)
+            strategy_repeat_count.append(count)
+
+    # Summarize efficiency of solution
+    duration = time.time() - start
+    sequence_lengths = np.arange(1, max_sequence_length + 1)
+    strategy_repeat_count = np.array(strategy_repeat_count)
+    old_number_of_samples = histogram.sum()
+    new_number_of_samples = strategy_repeat_count.sum()
+    total_tokens = max_sequence_length * new_number_of_samples
+    empty_tokens = sum(
+        [count * (max_sequence_length - sum(pack)) for count, pack in zip(strategy_repeat_count, strategy_set)]
+    )
+    efficiency = 100 - empty_tokens / total_tokens * 100
+    speedup_upper_bound = 1.0 / (
+        1 - (histogram * (1 - sequence_lengths / max_sequence_length)).sum() / old_number_of_samples
+    )
+
+    print(
+        f"Packing efficiency (fraction of real tokens): {efficiency:3.4f}\n",
+        f"Speed-up theoretical limit: {speedup_upper_bound:3.4f}\n",
+        f"Achieved speed-up over un-packed dataset: {old_number_of_samples/new_number_of_samples:3.5f}",
+        f"Runtime: Packed {old_number_of_samples} sequences in {duration:3.3f} seconds.",
+    )
+
+    return strategy_set, strategy_repeat_count  # =mixtures
+
+
+def SPFHP(histogram: np.ndarray, max_sequence_length: int, max_sequences_per_pack: int):
+    """Shortest-pack-first histogram-packing.
+
+    Sequence lengths are processed from longest to shortest. Each length is added to the open
+    pack with the most free tokens that can still hold it (the `offset` search starts at its
+    largest value and shrinks); if no open pack can hold it, new single-sequence packs are
+    started. A summary of the result is printed.
+
+    Args:
+        histogram: Counts per sequence length; entry `k` is read as the number of sequences
+            with `k + 1` tokens.
+        max_sequence_length: Token capacity of one pack.
+        max_sequences_per_pack: Maximum number of sequences in one pack.
+
+    Returns:
+        Tuple `(strategy_set, strategy_repeat_count)`: a list of packs (each a list of sequence
+        lengths) and a NumPy array holding how many times each pack is used.
+    """
+    start = time.time()
+    reversed_histogram = np.flip(histogram)
+    # Initialize main strategy data dictionary.
+    # The key indicates how many tokens are left for full length.
+    # The value is a list of tuples, consisting of counts and respective packs.
+    # A pack is a (sorted) list of sequence length values that get concatenated.
+    tmp_strategies_per_length = defaultdict(list)
+    strategies_per_length = defaultdict(list)
+    # Index i indicates here, how much space is left, due to reversed histogram
+    for i in range(max_sequence_length):
+        n_sequences_to_bin = reversed_histogram[i]
+        length_to_bin = max_sequence_length - i
+        offset = i + 1  # largest possible offset
+        while n_sequences_to_bin > 0:
+            # `length_to_bin + offset` is the free space an open pack must have so that
+            # `offset` tokens remain free after this sequence is added.
+            if (length_to_bin + offset) in tmp_strategies_per_length:
+                # extract shortest pack that will get modified
+                n_sequences_to_pack, pack = tmp_strategies_per_length[length_to_bin + offset].pop()
+                new_pack = pack + [length_to_bin]
+                count = min(n_sequences_to_pack, n_sequences_to_bin)
+                if n_sequences_to_pack > n_sequences_to_bin:
+                    # old pack gets reduced
+                    n_sequences_to_pack -= n_sequences_to_bin
+                    tmp_strategies_per_length[length_to_bin + offset].append((n_sequences_to_pack, pack))
+                    n_sequences_to_bin = 0
+                else:
+                    n_sequences_to_bin -= n_sequences_to_pack
+                add_pack(
+                    new_pack, count, tmp_strategies_per_length, strategies_per_length, max_sequences_per_pack, offset
+                )
+                # clean up to speed up main key search
+                if not tmp_strategies_per_length[length_to_bin + offset]:
+                    tmp_strategies_per_length.pop(length_to_bin + offset)
+            else:
+                offset -= 1
+            # Does not fit anywhere. Create new pack.
+            if offset < 0:
+                # A new single-sequence pack has `i` (= max_sequence_length - length_to_bin) free tokens.
+                add_pack(
+                    [length_to_bin],
+                    n_sequences_to_bin,
+                    tmp_strategies_per_length,
+                    strategies_per_length,
+                    max_sequences_per_pack,
+                    i,
+                )
+                n_sequences_to_bin = 0
+    # merge all strategies
+    for key in tmp_strategies_per_length:
+        strategies_per_length[key].extend(tmp_strategies_per_length[key])
+    # flatten strategies dictionary
+    strategy_set = []
+    strategy_repeat_count = []
+    for key in strategies_per_length:
+        for count, pack in strategies_per_length[key]:
+            # packs were built longest-first; reverse in place so lengths end up ascending
+            pack.reverse()
+            strategy_set.append(pack)
+            strategy_repeat_count.append(count)
+
+    # Summarize efficiency of solution
+    duration = time.time() - start
+    sequence_lengths = np.arange(1, max_sequence_length + 1)
+    strategy_repeat_count = np.array(strategy_repeat_count)
+    n_strategies = len(strategy_set)  # not used below
+    old_number_of_samples = histogram.sum()
+    new_number_of_samples = strategy_repeat_count.sum()
+    # total number of sequences across all packs, used for the average packing factor
+    sequences = sum([count * len(pack) for count, pack in zip(strategy_repeat_count, strategy_set)])
+    total_tokens = max_sequence_length * new_number_of_samples
+    empty_tokens = sum(
+        [count * (max_sequence_length - sum(pack)) for count, pack in zip(strategy_repeat_count, strategy_set)]
+    )
+    efficiency = 100 - empty_tokens / total_tokens * 100
+    speedup_upper_bound = 1.0 / (
+        1 - (histogram * (1 - sequence_lengths / max_sequence_length)).sum() / old_number_of_samples
+    )
+    packing_factor = sequences / sum(strategy_repeat_count)
+
+    print(
+        f"Packing efficiency (fraction of real tokens): {efficiency:3.4f}\n",
+        f"Speed-up theoretical limit: {speedup_upper_bound:3.4f}\n",
+        f"Achieved speed-up over un-packed dataset: {old_number_of_samples/new_number_of_samples:3.5f}\n",
+        f"Runtime: Packed {old_number_of_samples} sequences in {duration:3.3f} seconds\n",
+        f"Average packing factor: {packing_factor}",
+    )
+
+    return strategy_set, np.array(strategy_repeat_count)
diff --git a/packed-bert/utils/packing/dataset_creator.py b/packed-bert/utils/packing/dataset_creator.py
new file mode 100644
index 0000000..e3ba495
--- /dev/null
+++ b/packed-bert/utils/packing/dataset_creator.py
@@ -0,0 +1,310 @@
+# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import logging
+import time
+
+import numpy as np
+from tqdm import tqdm
+
+from .algorithms import LPFHP, SPFHP
+from .dataset_templates import PackedClassificationDataset, PackedQuestionAnsweringDataset
+
+
+"""
+Currently enabled supported tasks:
+* Single label classification with BERT
+* Multi label classification with BERT
+* Question-answering with BERT (SQuAD)
+"""
+
+logger = logging.getLogger("packing")
+
+
+class PackedDatasetCreator:
+    """Packs a tokenized dataset into fixed-length packs of several sequences.
+
+    A histogram of the sequence lengths is handed to the SPFHP or LPFHP algorithm
+    to obtain a packing strategy; `create()` then fills constant-size numpy arrays
+    following that strategy and wraps them in a packed dataset object.
+
+    Args:
+        tokenized_dataset: Tokenized data indexable by column name ("input_ids",
+            "attention_mask", "token_type_ids" and the task-specific columns).
+        problem_type: "single_label_classification", "multi_label_classification"
+            or "question_answering".
+        num_labels: Number of labels; required unless `inference` is True.
+        max_sequence_length: Length of every pack, in tokens.
+        max_sequences_per_pack: Maximum number of sequences placed in one pack.
+        training, validation, inference: Mode flags; at least one must be True.
+            For classification, `inference` takes precedence and labels are ignored.
+        algorithm: "SPFHP" or "LPFHP".
+        pad_to_global_batch_size: Append empty packs until the number of packs is
+            a multiple of `global_batch_size`.
+        global_batch_size: Batch size used for the padding above.
+        custom_label_key: Column holding the classification labels.
+
+    Raises:
+        ValueError: On an unsupported problem type or algorithm, when no mode flag
+            is set, when `num_labels` is missing outside inference, or when a
+            sequence length falls outside `1..max_sequence_length`.
+    """
+
+    def __init__(
+        self,
+        tokenized_dataset,
+        problem_type,
+        num_labels: int = None,
+        max_sequence_length: int = 384,
+        max_sequences_per_pack: int = 6,
+        training: bool = False,
+        validation: bool = False,
+        inference: bool = False,
+        algorithm: str = "SPFHP",
+        pad_to_global_batch_size: bool = False,
+        global_batch_size: int = None,
+        custom_label_key: str = "labels",
+    ) -> None:
+        # This list should contain all currently supported tasks (for BERT, currently)
+        supported_problem_types = ["single_label_classification", "multi_label_classification", "question_answering"]
+
+        self.max_seq_len = max_sequence_length
+        self.max_seq_per_pack = max_sequences_per_pack
+        self.num_labels = num_labels
+        self.training = training
+        self.validation = validation
+        self.inference = inference
+        self.algorithm = algorithm
+
+        # Verify the problem type
+        if problem_type not in supported_problem_types:
+            self._fail(
+                f"Unsupported problem type {problem_type!r}. Pass one of the supported types: {supported_problem_types}."
+            )
+        self.problem_type = problem_type
+
+        # Verify the task
+        if not (training or validation or inference):
+            self._fail(
+                "One of 'training', 'validation' or 'inference' must be set to True when calling PackedDatasetCreator."
+            )
+
+        # Verify num_labels if not inference
+        if inference:
+            logger.info("Inference mode has been set. This will override training/validation mode and ignore labels.")
+        elif num_labels is None:
+            self._fail(
+                'For validation (to evaluate) and training, num_labels must be passed to PackedDatasetCreator - num_labels got "None"!'
+            )
+
+        # Get the unpacked default data columns
+        self.unpacked_input_ids = tokenized_dataset["input_ids"]
+        self.unpacked_attention_mask = tokenized_dataset["attention_mask"]
+        self.unpacked_token_type_ids = tokenized_dataset["token_type_ids"]
+
+        # Get the strategy to pack the dataset using the algorithm
+        self.strategy = self.get_strategy()
+        total_num_packs = np.sum(self.strategy[1])
+
+        # Optionally round the number of packs up to a multiple of the global batch size to avoid
+        # skipping samples; the packs added this way are never filled by `create()`.
+        if pad_to_global_batch_size and global_batch_size:
+            remainder = total_num_packs % global_batch_size
+            if remainder != 0:
+                total_num_packs += global_batch_size - remainder
+
+        self.total_num_packs = total_num_packs
+
+        # Prepare the manually padded constant sized data
+        self.shift_cls_tokens = True
+        self.adjust_offset_positions = False
+
+        self.packed_input_ids = np.zeros((self.total_num_packs, self.max_seq_len), dtype=int)
+        self.packed_attention_mask = np.zeros((self.total_num_packs, self.max_seq_len), dtype=int)
+        self.packed_token_type_ids = np.zeros((self.total_num_packs, self.max_seq_len), dtype=int)
+        self.packed_position_ids = np.zeros((self.total_num_packs, self.max_seq_len), dtype=int)
+        per_sequence_shape = (self.total_num_packs, self.max_seq_per_pack)
+
+        # Task-specific dataset categories and dataset class definitions
+        if problem_type in ("single_label_classification", "multi_label_classification"):
+            self.dataset_class = PackedClassificationDataset
+
+            if not self.inference:
+                # Labels default to -100; multi-label tasks hold `num_labels` values per sequence.
+                multi_label = problem_type == "multi_label_classification"
+                self.unpacked_labels = tokenized_dataset[custom_label_key]
+                self.packed_labels = np.full(
+                    per_sequence_shape + ((self.num_labels,) if multi_label else ()), -100, dtype=int
+                )
+                self.packed_example_ids = None
+            else:
+                self.packed_labels = None
+                self.packed_example_ids = np.full(per_sequence_shape, -100, dtype=int)
+
+        elif problem_type == "question_answering":
+            self.dataset_class = PackedQuestionAnsweringDataset
+            if self.training:
+                self.unpacked_start_positions = tokenized_dataset["start_positions"]
+                self.unpacked_end_positions = tokenized_dataset["end_positions"]
+                self.packed_start_positions = np.full(per_sequence_shape, -100, dtype=int)
+                self.packed_end_positions = np.full(per_sequence_shape, -100, dtype=int)
+            else:
+                self.packed_start_positions = None
+                self.packed_end_positions = None
+
+            if self.validation or self.inference:
+                self.unpacked_example_ids = tokenized_dataset["example_id"]
+                self.unpacked_offset_mapping = tokenized_dataset["offset_mapping"]
+                self.packed_example_ids = np.zeros(per_sequence_shape, dtype="<U32")
+                self.packed_offset_mapping = -np.ones((self.total_num_packs, self.max_seq_len, 2), dtype=int)
+            else:
+                self.packed_example_ids = None
+                self.packed_offset_mapping = None
+
+            self.adjust_offset_positions = True
+            self.shift_cls_tokens = False
+
+    @staticmethod
+    def _fail(message):
+        """Log `message` as an error and raise it as a ValueError."""
+        logger.error(message)
+        raise ValueError(message)
+
+    # This function generates the histogram to be used by the histogram-based packing algorithm
+    def generate_histogram(self):
+        """Return an int64 array whose entry `i` counts the sequences of length `i + 1`."""
+        dataset_seq_lens = np.array([len(seq) for seq in self.unpacked_input_ids])
+        # A length of 0 would silently wrap around to the last histogram bin, so reject it explicitly.
+        if dataset_seq_lens.size and not (1 <= dataset_seq_lens.min() and dataset_seq_lens.max() <= self.max_seq_len):
+            self._fail(f"Sequence lengths must lie between 1 and max_sequence_length ({self.max_seq_len}).")
+        histogram = np.zeros(self.max_seq_len, dtype=np.int64)
+        seq_lens, counts = np.unique(dataset_seq_lens, return_counts=True)
+        histogram[seq_lens - 1] = counts
+        return histogram
+
+    # This function runs the algorithm on the histogram to obtain the packing strategy
+    def get_strategy(self):
+        """Store the length histogram in `self.histogram` and return the selected algorithm's result."""
+        self.histogram = self.generate_histogram()
+
+        if self.algorithm == "SPFHP":
+            return SPFHP(self.histogram, self.max_seq_len, self.max_seq_per_pack)
+        if self.algorithm == "LPFHP":
+            return LPFHP(self.histogram, self.max_seq_len, self.max_seq_per_pack)
+        self._fail("Algorithm type unsupported. Pass one of: LPFHP, SPFHP")
+
+    # This function packs the dataset following the strategy
+    def create(self):
+        """Fill the packed arrays according to the strategy and return the matching packed dataset object."""
+        strategy_set = self.strategy[0]
+        strategy_repeat_count = self.strategy[1]
+        skip_cls = int(self.shift_cls_tokens)
+        is_classification = self.problem_type in ("single_label_classification", "multi_label_classification")
+
+        # Group the sequence indices by length, in ascending argsort order. Popping from the end of a
+        # group takes the last not-yet-used sequence of that length without rescanning the dataset.
+        dataset_seq_lens = np.array([len(seq) for seq in self.unpacked_input_ids])
+        unused_by_length = {}
+        for seq_idx in np.argsort(dataset_seq_lens):
+            unused_by_length.setdefault(int(dataset_seq_lens[seq_idx]), []).append(int(seq_idx))
+
+        # Pack the data using the developed strategies
+        pack_index = 0
+        st = time.time()
+        for i in range(len(strategy_repeat_count)):
+            strategy = strategy_set[i]
+
+            # Offset applied to the answer positions of each sequence: the summed lengths of the
+            # sequences placed before it in the pack.
+            if self.adjust_offset_positions:
+                positions_offset = [sum(strategy[:n]) for n in range(len(strategy))]
+
+            for _ in range(strategy_repeat_count[i]):
+                inds = [unused_by_length.get(x, []).pop() for x in strategy]
+
+                # Exclude the CLS tokens to put them at the end later
+                input_id_pack = list(itertools.chain(*[self.unpacked_input_ids[x][skip_cls:] for x in inds]))
+                token_type_ids_pack = list(
+                    itertools.chain(*[self.unpacked_token_type_ids[x][skip_cls:] for x in inds])
+                )
+                # The attention mask numbers the sequences 1, 2, ...; position ids restart for each sequence.
+                attention_mask_pack, position_ids_pack = [], []
+                for n, v in enumerate(inds):
+                    seq_len = len(self.unpacked_attention_mask[v])
+                    attention_mask_pack.extend([n + 1] * (seq_len - skip_cls))
+                    position_ids_pack.extend(range(skip_cls, seq_len))
+
+                # Write the pack into the constant-size arrays; slots past its length keep their initial value
+                self.packed_input_ids[pack_index, : len(input_id_pack)] = input_id_pack
+                self.packed_attention_mask[pack_index, : len(attention_mask_pack)] = attention_mask_pack
+                self.packed_token_type_ids[pack_index, : len(token_type_ids_pack)] = token_type_ids_pack
+                self.packed_position_ids[pack_index, : len(position_ids_pack)] = position_ids_pack
+
+                if is_classification:
+                    # Inference takes precedence over training/validation: labels are then neither read nor packed.
+                    if self.inference:
+                        self.packed_example_ids[pack_index, : len(inds)] = inds
+                    elif self.problem_type == "single_label_classification":
+                        labels_pack = [self.unpacked_labels[x] for x in inds]
+                        self.packed_labels[pack_index, : len(labels_pack)] = labels_pack
+                    else:
+                        labels_pack = np.stack([self.unpacked_labels[x] for x in inds])
+                        self.packed_labels[pack_index, : labels_pack.shape[0], :] = labels_pack
+                else:
+                    if self.training:
+                        start_positions_pack = [
+                            max(self.unpacked_start_positions[v] + positions_offset[n], 0) for n, v in enumerate(inds)
+                        ]
+                        end_positions_pack = [
+                            max(self.unpacked_end_positions[v] + positions_offset[n], 0) for n, v in enumerate(inds)
+                        ]
+                        self.packed_start_positions[pack_index, : len(start_positions_pack)] = start_positions_pack
+                        self.packed_end_positions[pack_index, : len(end_positions_pack)] = end_positions_pack
+
+                    if self.validation or self.inference:
+                        example_ids_pack = [self.unpacked_example_ids[x] for x in inds]
+                        offset_mapping_pack = list(itertools.chain(*[self.unpacked_offset_mapping[x] for x in inds]))
+                        self.packed_example_ids[pack_index, : len(example_ids_pack)] = example_ids_pack
+                        self.packed_offset_mapping[pack_index, : len(offset_mapping_pack)] = offset_mapping_pack
+
+                # Now add the CLS tokens and their masks at the end of the pack if classification task
+                # NOTE(review): the last `max_seq_per_pack` slots are written unconditionally, so tokens of a
+                # pack reaching into them would be overwritten - confirm inputs are tokenized short enough.
+                if skip_cls:
+                    cls_token = self.unpacked_input_ids[0][0]
+                    self.packed_input_ids[pack_index, -self.max_seq_per_pack :] = [cls_token] * self.max_seq_per_pack
+                    self.packed_attention_mask[pack_index, -self.max_seq_per_pack :] = list(
+                        range(1, self.max_seq_per_pack + 1)
+                    )
+
+                pack_index += 1
+
+        print(f"Packed dataset creation time: {round(time.time()-st, 4)}s")
+
+        shared = dict(
+            input_ids=self.packed_input_ids,
+            attention_mask=self.packed_attention_mask,
+            token_type_ids=self.packed_token_type_ids,
+            position_ids=self.packed_position_ids,
+            example_ids=self.packed_example_ids,
+        )
+        if is_classification:
+            return PackedClassificationDataset(labels=self.packed_labels, **shared)
+        return PackedQuestionAnsweringDataset(
+            start_positions=self.packed_start_positions,
+            end_positions=self.packed_end_positions,
+            offset_mapping=self.packed_offset_mapping,
+            **shared,
+        )
diff --git a/packed-bert/utils/packing/dataset_templates.py b/packed-bert/utils/packing/dataset_templates.py
new file mode 100644
index 0000000..90cb3d6
--- /dev/null
+++ b/packed-bert/utils/packing/dataset_templates.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from torch.utils.data import Dataset
+
+
+class PackedClassificationDataset(Dataset):
+    """Dataset exposing packed classification columns as per-index dict samples."""
+
+    def __init__(self, input_ids, attention_mask, token_type_ids, position_ids, labels=None, example_ids=None):
+        # The four mandatory columns are indexed per sample; `labels` and
+        # `example_ids` are optional and may be left as None.
+        self.input_ids, self.attention_mask = input_ids, attention_mask
+        self.token_type_ids, self.position_ids = token_type_ids, position_ids
+        self.labels, self.example_ids = labels, example_ids
+
+    def __len__(self):
+        """Return the number of samples, taken from `input_ids`."""
+        return len(self.input_ids)
+
+    def __getitem__(self, index):
+        """Return the sample at `index` as a dict.
+
+        The dict always holds "input_ids", "attention_mask", "token_type_ids"
+        and "position_ids"; "labels" and "example_ids" are added only when the
+        corresponding column was provided at construction.
+        """
+        # Mandatory columns first.
+        sample = {
+            name: getattr(self, name)[index]
+            for name in ("input_ids", "attention_mask", "token_type_ids", "position_ids")
+        }
+
+        # Optional columns follow in a fixed order: labels, then example_ids.
+        for name in ("labels", "example_ids"):
+            column = getattr(self, name)
+            if column is not None:
+                sample[name] = column[index]
+
+        return sample
+
+
+class PackedQuestionAnsweringDataset(Dataset):
+    """Dataset exposing packed question-answering columns as per-index dict samples."""
+
+    def __init__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        start_positions,
+        end_positions,
+        offset_mapping,
+        example_ids,
+    ):
+        # Any of the last four columns may be None; `__getitem__` then leaves
+        # the matching keys out of the returned sample.
+        self.input_ids, self.attention_mask = input_ids, attention_mask
+        self.token_type_ids, self.position_ids = token_type_ids, position_ids
+        self.start_positions, self.end_positions = start_positions, end_positions
+        self.offset_mapping, self.example_ids = offset_mapping, example_ids
+
+    def __len__(self):
+        """Return the number of samples, taken from `input_ids`."""
+        return len(self.input_ids)
+
+    def __getitem__(self, index):
+        """Return the sample at `index` as a dict.
+
+        "input_ids", "attention_mask", "token_type_ids" and "position_ids" are
+        always present. "start_positions"/"end_positions" are added only when
+        both columns exist, and likewise for "offset_mapping"/"example_ids".
+        """
+
+        def row_of(column):
+            # Optional columns are stored as None when absent.
+            return None if column is None else column[index]
+
+        sample = {
+            name: getattr(self, name)[index]
+            for name in ("input_ids", "attention_mask", "token_type_ids", "position_ids")
+        }
+        start, end = row_of(self.start_positions), row_of(self.end_positions)
+        offsets, ids = row_of(self.offset_mapping), row_of(self.example_ids)
+
+        has_positions = not (self.start_positions is None or self.end_positions is None)
+        if has_positions:
+            sample.update(start_positions=start, end_positions=end)
+        has_offsets_and_ids = not (self.offset_mapping is None or self.example_ids is None)
+        if has_offsets_and_ids:
+            sample.update(offset_mapping=offsets, example_ids=ids)
+
+        return sample
diff --git a/packed-bert/utils/packing/qa_utils.py b/packed-bert/utils/packing/qa_utils.py
new file mode 100644
index 0000000..528e653
--- /dev/null
+++ b/packed-bert/utils/packing/qa_utils.py
@@ -0,0 +1,243 @@
+# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+
+import numpy as np
+from datasets import Dataset
+from tqdm import tqdm
+
+from transformers import AutoTokenizer
+
+
+def preprocess_packed_qa(
+    dataset,
+    tokenizer,
+    question_key: str = "question",
+    context_key: str = "context",
+    answer_key: str = "answer",
+    sequence_length: int = 384,
+    padding: bool = True,
+    train: bool = True,
+):
+    """Tokenize question/context pairs into features for packed question answering.
+
+    Contexts are truncated to `sequence_length` and the overflowing tokens are
+    kept, so one example may yield several features.
+
+    With `train=True` the result carries "start_positions"/"end_positions"
+    (token indices of the first listed answer, or the CLS index when there is no
+    answer or it lies outside the feature) and the offset mapping is dropped.
+    Otherwise it carries "example_id" and an "offset_mapping" in which every
+    non-context token is set to (0, 0).
+
+    Returns:
+        A `Dataset` built from the tokenizer output with the columns above.
+    """
+    pad_on_right = tokenizer.padding_side == "right"
+    # Sequence id that marks context tokens within each tokenized pair.
+    context_id = 1 if pad_on_right else 0
+    first_key, second_key = (question_key, context_key) if pad_on_right else (context_key, question_key)
+
+    tokenized_dataset = tokenizer(
+        dataset[first_key],
+        dataset[second_key],
+        truncation="only_second" if pad_on_right else "only_first",
+        max_length=sequence_length,
+        return_overflowing_tokens=True,
+        return_offsets_mapping=True,
+        padding=padding,
+    )
+
+    # One example can give several features; this maps each feature to the index of its example.
+    sample_mapping = tokenized_dataset.pop("overflow_to_sample_mapping")
+
+    if not train:
+        # Keep the id of the example behind each feature, together with the offset mappings.
+        tokenized_dataset["example_id"] = []
+        dataset_ids = dataset["id"]
+
+        for i in range(len(tokenized_dataset["input_ids"])):
+            sequence_ids = tokenized_dataset.sequence_ids(i)
+            tokenized_dataset["example_id"].append(dataset_ids[sample_mapping[i]])
+
+            # Offsets of tokens outside the context become (0, 0), so context membership is easy to test.
+            tokenized_dataset["offset_mapping"][i] = [
+                (offset if sequence_ids[k] == context_id else (0, 0))
+                for k, offset in enumerate(tokenized_dataset["offset_mapping"][i])
+            ]
+
+        return Dataset.from_dict(tokenized_dataset)
+
+    dataset_answers = dataset[answer_key]
+    start_positions, end_positions = [], []
+    offset_mapping = tokenized_dataset.pop("offset_mapping")
+
+    for i, offsets in enumerate(tqdm(offset_mapping)):
+        input_ids = tokenized_dataset["input_ids"][i]
+        # Impossible answers are labelled with the index of the CLS token.
+        cls_index = input_ids.index(tokenizer.cls_token_id)
+        sequence_ids = tokenized_dataset.sequence_ids(i)
+        answers = dataset_answers[sample_mapping[i]]
+
+        if len(answers["answer_start"]) == 0:
+            start_positions.append(cls_index)
+            end_positions.append(cls_index)
+            continue
+
+        # Start/end character index of the first listed answer in the text.
+        start_char = answers["answer_start"][0]
+        end_char = start_char + len(answers["text"][0])
+        # First and last context token of the current feature.
+        token_start_index = 0
+        while sequence_ids[token_start_index] != context_id:
+            token_start_index += 1
+        token_end_index = len(input_ids) - 1
+        while sequence_ids[token_end_index] != context_id:
+            token_end_index -= 1
+
+        answer_in_span = offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char
+        if not answer_in_span:
+            start_positions.append(cls_index)
+            end_positions.append(cls_index)
+            continue
+
+        # Move both indices onto the ends of the answer; the start may pass the last offset (answer is last word).
+        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
+            token_start_index += 1
+        start_positions.append(token_start_index - 1)
+        while offsets[token_end_index][1] >= end_char:
+            token_end_index -= 1
+        end_positions.append(token_end_index + 1)
+
+    tokenized_dataset["start_positions"] = start_positions
+    tokenized_dataset["end_positions"] = end_positions
+
+    return Dataset.from_dict(tokenized_dataset)
+
+
+def postprocess_packed_qa_predictions(
+    raw_val_dataset,
+    tokenized_val_dataset,
+    raw_predictions,
+    n_best_size=20,
+    max_answer_length=30,
+    squad_v2=False,
+    cls_token_id=101,
+):
+    """Turn packed start/end logits into one answer string per raw example.
+
+    Args:
+        raw_val_dataset: Raw examples; `raw_val_dataset["id"]` is read as a
+            column and each iterated example provides "id" and "context".
+        tokenized_val_dataset: Packed features; each item provides
+            "example_ids" and "offset_mapping" (plus "input_ids", which is only
+            read when `squad_v2` is set).
+        raw_predictions: Pair `(all_start_logits, all_end_logits)`, each indexed
+            as `[pack_index, sequence_in_pack_index]`.
+        n_best_size: Number of top start and top end positions to combine.
+        max_answer_length: Maximum answer length, in tokens.
+        squad_v2: When True, "" is predicted unless the best span beats the null
+            (CLS) score.
+        cls_token_id: Token id used to locate the CLS position of each sequence.
+
+    Returns:
+        An OrderedDict mapping example id to answer text. An example for which
+        no span was scored (for instance because its packs were dropped by the
+        dataloader) maps to "".
+    """
+    all_start_logits, all_end_logits = raw_predictions
+
+    # The dataloader drop_last may leave fewer predictions than packs; packs at
+    # or beyond this index have no logits.
+    dataloader_cap = all_start_logits.shape[0]
+
+    # Map every raw example index to its [pack, position-in-pack] features.
+    example_id_to_index = {k: i for i, k in enumerate(raw_val_dataset["id"])}
+    features_per_example = collections.defaultdict(list)
+    for i, feature in enumerate(tokenized_val_dataset):
+        for j, example_id in enumerate(feature["example_ids"]):
+            # An empty id marks a sequence slot of the pack that holds no example.
+            if example_id != "":
+                features_per_example[example_id_to_index[example_id]].append([i, j])
+
+    predictions = collections.OrderedDict()
+
+    print(
+        f"Post-processing {len(raw_val_dataset)} example predictions split into {len(tokenized_val_dataset)} features."
+    )
+
+    for example_index, example in enumerate(tqdm(raw_val_dataset)):
+        null_score = None  # Largest null (CLS) score of the example; only used if squad_v2 is True.
+        valid_answers = []
+        context = example["context"]
+
+        for pack_index, sequence_in_pack_index in features_per_example[example_index]:
+            # Ignore packs the validation loop skipped because the dataloader dropped uneven batches.
+            if pack_index >= dataloader_cap:
+                continue
+
+            start_logits = all_start_logits[pack_index, sequence_in_pack_index]
+            end_logits = all_end_logits[pack_index, sequence_in_pack_index]
+            pack = tokenized_val_dataset[pack_index]
+            offset_mapping = pack["offset_mapping"]
+
+            if squad_v2:
+                # The n-th CLS token found in the pack is taken as the CLS of the n-th sequence.
+                cls_indices = [k for k, v in enumerate(pack["input_ids"]) if v == int(cls_token_id)]
+                cls_index = cls_indices[sequence_in_pack_index]
+                feature_null_score = start_logits[cls_index] + end_logits[cls_index]
+                # Keep the largest null score seen across the features of this example.
+                if null_score is None or null_score < feature_null_score:
+                    null_score = feature_null_score
+
+            # Go through all possibilities for the `n_best_size` greater start and end logits.
+            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
+            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
+
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # Don't consider out-of-scope answers: indices out of bounds or positions without an offset.
+                    # NOTE(review): non-context and padding offsets appear to be stored as (0, 0) / -1 by the
+                    # packing utilities rather than None/[] - confirm this filter is effective.
+                    if (
+                        start_index >= len(offset_mapping)
+                        or end_index >= len(offset_mapping)
+                        or offset_mapping[start_index] is None
+                        or offset_mapping[end_index] is None
+                        or offset_mapping[start_index] == []
+                        or offset_mapping[end_index] == []
+                    ):
+                        continue
+                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
+                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
+                        continue
+
+                    start_char, end_char = offset_mapping[start_index][0], offset_mapping[end_index][1]
+                    score = start_logits[start_index] + end_logits[end_index]
+                    valid_answers.append({"score": score, "text": context[start_char:end_char]})
+
+        if valid_answers:
+            # `max` returns the first of equally scored answers, as a stable descending sort would.
+            best_answer = max(valid_answers, key=lambda x: x["score"])
+        else:
+            # Not a single span was scored; create a fake empty prediction to avoid failure.
+            best_answer = {"text": "", "score": 0.0}
+
+        # For squad_v2 the best span must beat the null score. When no feature was scored at all
+        # (null_score is None) there is no span either, so the prediction is "".
+        beats_null = null_score is not None and best_answer["score"] > null_score
+        predictions[example["id"]] = best_answer["text"] if (not squad_v2 or beats_null) else ""
+
+    return predictions