@@ -31,10 +31,6 @@ There will be other versions of this code that specialize it and make it fast.
3131// defines: dataloader_init, dataloader_reset, dataloader_next_batch, dataloader_free
3232#include " llmc/dataloader.h"
3333
34- // CPU reference implementations
35- #include < iostream>
36- // #include "gpt2_cpu.hpp"
37-
3834using namespace gpu ;
3935
4036// ----------------------------------------------------------------------------
@@ -70,26 +66,6 @@ typedef struct {
7066 float * lnfb; // (C)
7167} ParameterTensors;
7268
73-
74- typedef struct {
75- Tensor wte; // (V, C)
76- Tensor wpe; // (maxT, C)
77- Tensor ln1w; // (L, C)
78- Tensor ln1b; // (L, C)
79- Tensor qkvw; // (L, 3*C, C)
80- Tensor qkvb; // (L, 3*C)
81- Tensor attprojw; // (L, C, C)
82- Tensor attprojb; // (L, C)
83- Tensor ln2w; // (L, C)
84- Tensor ln2b; // (L, C)
85- Tensor fcw; // (L, 4*C, C)
86- Tensor fcb; // (L, 4*C)
87- Tensor fcprojw; // (L, C, 4*C)
88- Tensor fcprojb; // (L, C)
89- Tensor lnfw; // (C)
90- Tensor lnfb; // (C)
91- } GPUParameterTensors;
92-
9369void fill_in_parameter_sizes (size_t * param_sizes, GPT2Config config) {
9470 size_t Vp = config.padded_vocab_size ;
9571 size_t C = config.channels ;
@@ -164,32 +140,6 @@ typedef struct {
164140} ActivationTensors;
165141
166142
167- typedef struct {
168- Tensor encoded; // (B, T, C)
169- Tensor ln1; // (L, B, T, C)
170- Tensor ln1_mean; // (L, B, T)
171- Tensor ln1_rstd; // (L, B, T)
172- Tensor qkv; // (L, B, T, 3*C)
173- Tensor atty; // (L, B, T, C)
174- Tensor preatt; // (L, B, NH, T, T)
175- Tensor att; // (L, B, NH, T, T)
176- Tensor attproj; // (L, B, T, C)
177- Tensor residual2; // (L, B, T, C)
178- Tensor ln2; // (L, B, T, C)
179- Tensor ln2_mean; // (L, B, T)
180- Tensor ln2_rstd; // (L, B, T)
181- Tensor fch; // (L, B, T, 4*C)
182- Tensor fch_gelu; // (L, B, T, 4*C)
183- Tensor fcproj; // (L, B, T, C)
184- Tensor residual3; // (L, B, T, C)
185- Tensor lnf; // (B, T, C)
186- Tensor lnf_mean; // (B, T)
187- Tensor lnf_rstd; // (B, T)
188- Tensor logits; // (B, T, V)
189- Tensor probs; // (B, T, V)
190- Tensor losses; // (B, T)
191- } GPUActivationTensors;
192-
193143
194144void fill_in_activation_sizes (size_t * act_sizes, GPT2Config config, int B, int T) {
195145 size_t C = config.channels ;
@@ -241,10 +191,26 @@ float* malloc_and_point_activations(ActivationTensors* acts, size_t* act_sizes)
241191 return acts_memory;
242192}
243193
194+ struct GPUParameters {
195+ Tensor data[NUM_PARAMETER_TENSORS];
196+ };
197+
198+ struct GPUActivations {
199+ Tensor data[NUM_ACTIVATION_TENSORS];
200+ };
201+
202+
203+ void gpu_alloc (Context& ctx, Tensor* tensors, size_t * sizes, size_t n) {
204+ for (size_t i = 0 ; i < n; i++) {
205+ tensors[i] = createTensor (ctx, Shape{sizes[i]}, kf32);
206+ }
207+ }
208+
244209typedef struct {
245210 GPT2Config config;
246211 // the weights (parameters) of the model, and their sizes
247212 ParameterTensors params;
213+ GPUParameters params_; // TODO(avh): eventually this replaces params
248214 size_t param_sizes[NUM_PARAMETER_TENSORS];
249215 float * params_memory;
250216 size_t num_parameters;
@@ -256,6 +222,7 @@ typedef struct {
256222 float * v_memory;
257223 // the activations of the model, and their sizes
258224 ActivationTensors acts;
225+ GPUActivations acts_; // TODO(avh): eventually this replaces acts
259226 size_t act_sizes[NUM_ACTIVATION_TENSORS];
260227 float * acts_memory;
261228 size_t num_activations;
@@ -270,7 +237,7 @@ typedef struct {
270237 float mean_loss; // after a forward pass with targets, will be populated with the mean loss
271238} GPT2;
272239
273- void gpt2_build_from_checkpoint (GPT2 *model, const char * checkpoint_path) {
240+ void gpt2_build_from_checkpoint (Context& ctx, GPT2 *model, const char * checkpoint_path) {
274241
275242 // read in model from a checkpoint file
276243 FILE *model_file = fopenCheck (checkpoint_path, " rb" );
@@ -330,6 +297,10 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
330297 model->batch_size = 0 ;
331298 model->seq_len = 0 ;
332299 model->mean_loss = -1 .0f ; // -1.0f will designate no loss
300+
301+ // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations
302+ gpu_alloc (ctx, model->params_ .data , model->param_sizes , NUM_PARAMETER_TENSORS);
303+
333304}
334305
335306
@@ -364,6 +335,8 @@ void gpt2_forward(Context& ctx, GPT2 *model, int* inputs, int* targets, size_t B
364335 model->seq_len = T;
365336 // and now allocate the space
366337 fill_in_activation_sizes (model->act_sizes , model->config , B, T);
338+ // TODO(avh): this is just a resource test for now, eventually deprecate CPU allocations
339+ gpu_alloc (ctx, model->acts_ .data , model->act_sizes , NUM_ACTIVATION_TENSORS);
367340 size_t num_activations = 0 ;
368341 for (size_t i = 0 ; i < NUM_ACTIVATION_TENSORS; i++) {
369342 num_activations += model->act_sizes [i];
@@ -678,11 +651,18 @@ int sample_mult(float* probabilities, int n, float coin) {
678651// main training loop
679652int main () {
680653
681- setLogLevel (kError );
654+ setLogLevel (kWarn );
655+
656+ printf (" Creating GPU context\n " );
657+ WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
658+ gpu::Context ctx = gpu::createContext ({}, {}, {
659+ .requiredLimits = &requiredLimits
660+ });
661+ // gpu::Context ctx = gpu::createContext();
682662
683663 // build the GPT-2 model from a checkpoint
684664 GPT2 model;
685- gpt2_build_from_checkpoint (&model, " gpt2_124M.bin" );
665+ gpt2_build_from_checkpoint (ctx, &model, " gpt2_124M.bin" );
686666
687667 // build the DataLoaders from tokens files. for now use tiny_shakespeare if available, else tiny_stories
688668 const char * tiny_stories_train = " dev/data/tinystories/TinyStories_train.bin" ;
@@ -709,13 +689,7 @@ int main() {
709689 int * gen_tokens = (int *)mallocCheck (B * T * sizeof (int ));
710690 const int genT = 64 ; // number of steps of inference we will do
711691
712- printf (" Creating GPU context\n " );
713- WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
714- gpu::Context ctx = gpu::createContext ({}, {}, {
715- .requiredLimits = &requiredLimits
716- });
717- // gpu::Context ctx = gpu::createContext();
718-
692+
719693 // train
720694 struct timespec start, end;
721695 printf (" Starting training\n " );
0 commit comments