diff --git a/Analysis.txt.txt b/Analysis.txt.txt
new file mode 100644
index 0000000..2bd9a93
--- /dev/null
+++ b/Analysis.txt.txt
@@ -0,0 +1,24 @@
+N_500: 2475-2550 -> 2512
+N_2000: 1446-1475 -> 1460.5
+N_5000: 810-825 -> 817.5
+N_8000: 548-564 -> 556
+N_20000: 211-216-> 213.5
+N_50000: 37-39 -> 38
+N_100000: 10-11 -> 10.5
+
+U_500: 1750-1920 -> 1835
+U_2000: 1851-1887 -> 1869
+U_5000: 1482-1497 -> 1489.5
+U_8000: 1444-1475 -> 1459.5
+U_20000: 1110-1192 -> 1151
+U_50000: 506-580 -> 543
+U_100000: 280-300 -> 290
+
+C_500: 1880-1970 -> 1925
+C_2000: 1842-1885 -> 1863.5
+C_5000: 1495-1520 -> 1507.5
+C_8000: 1486-1526 -> 1506
+C_20000: 1460-1480 -> 1470
+C_50000: 1117-1174 -> 1145.5
+C_100000: 718-728 -> 713
+
diff --git a/README.md b/README.md
index d63a6a1..19f5927 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,31 @@
+<h1> CUDA Flocking Simulation with Reynold Boids Algorithm
+
+# ![top](images/top_image.png)
+
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
+* Haorong Yang
+* [LinkedIn](https://www.linkedin.com/in/haorong-henry-yang/)
+* Tested on: Windows 10 Home, i7-10750H @ 2.60GHz 16GB, GTX 2070 Super Max-Q (Personal)
+
+
+
+This is a CUDA based simulation of the Reynold Boids Algorithm. Three approaches were taken to perform neighbor search.
+The Naive approach does a brute force search through every other boid to update one boid.
+The Uniform Grid and Coherent Grid searches through the cells within its search radius, 
+but Coherent Grid has a modification in the method of accessing elements.
+
+The method used to compare performance was by frame rate under different boid counts.
+I took the lowest and highest frame rate that appeared at least twice within the first minute, 
+and took the average of that as the performance result.
+
+Comparison of 3 methods of step simulation, at most 8 neighbors (Naive Search, Uniform Grid, Coherent Grid):
+![chart1](images/fpsGraph8.PNG)
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+Comparison of 3 methods of step simulation, at most 27 neighbors (Naive Search, Uniform Grid, Coherent Grid):
+![chart1](images/fpsGraph27.PNG)
 
-### (TODO: Your README)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* As the number of boids increase, the performance decreases as we could observe that the frame rates decrease. The reason for this is the arrays become longer and there is simply more calculation to do as there are more boids.
+* The coherent grid method showed greater performance as the boid count increases to a large number. At smaller number of boids, there is barely noticable increase in performance compared to the uniform grid. This is the outcome as I expected, because as the boid count increases, the cost to access boidIndices and then positions and velocity becomes more significant since they are performed in for every single neighbor search, whereas shuffling of the position and velocity arrays only need to happen once every step simulation
+* In 27 neighbor search, coherent still performs the best.
diff --git a/images/fpsGraph27.PNG b/images/fpsGraph27.PNG
new file mode 100644
index 0000000..545c6da
Binary files /dev/null and b/images/fpsGraph27.PNG differ
diff --git a/images/fpsGraph8.PNG b/images/fpsGraph8.PNG
new file mode 100644
index 0000000..bb3c35d
Binary files /dev/null and b/images/fpsGraph8.PNG differ
diff --git a/images/top_image.png b/images/top_image.png
new file mode 100644
index 0000000..b33c6e1
Binary files /dev/null and b/images/top_image.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..fcd28c7 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -76,9 +76,11 @@ glm::vec3 *dev_vel2;
 // For efficient sorting and the uniform grid. These should always be parallel.
 int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
 int *dev_particleGridIndices; // What grid cell is this particle in?
+glm::vec3 *dev_dummyPos;
+glm::vec3 *dev_dummyVel1;
 // needed for use with thrust
-thrust::device_ptr<int> dev_thrust_particleArrayIndices;
-thrust::device_ptr<int> dev_thrust_particleGridIndices;
+//thrust::device_ptr<int> dev_thrust_particleArrayIndices;
+//thrust::device_ptr<int> dev_thrust_particleGridIndices;
 
 int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
 int *dev_gridCellEndIndices;   // to this cell?
@@ -157,7 +159,8 @@ void Boids::initSimulation(int N) {
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+  // makes the cell width twice the maximum search radius
+  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); 
   int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
   gridSideCount = 2 * halfSideCount;
 
@@ -169,6 +172,25 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_dummyPos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_dummyPos failed!");
+
+  cudaMalloc((void**)&dev_dummyVel1, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_dummyVel1 failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridEndStartIndices failed!");
+
+  // end of my code
   cudaDeviceSynchronize();
 }
 
@@ -230,10 +252,41 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 thisPos = pos[iSelf];
+  glm::vec3 thisVel = vel[iSelf];
+  glm::vec3 perceivedCenter = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 separation = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 cohesion = glm::vec3(0.0f, 0.0f, 0.0f);
+  int neighborCount = 0;
+  for (int b = 0; b < N; b++) {
+    if (b == iSelf) {
+      continue;
+    }
+    glm::vec3 bPos = pos[b];
+    glm::vec3 bVel = vel[b];
+    float distance = glm::length(thisPos - bPos);
+    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+    if (distance < rule1Distance) {
+      perceivedCenter += bPos;
+      neighborCount += 1;
+    }
+    // Rule 2: boids try to stay a distance d away from each other
+    if (distance < rule2Distance) {
+      separation -= (bPos - thisPos);
+    }
+    // Rule 3: boids try to match the speed of surrounding boids
+    if (distance < rule3Distance) {
+      cohesion += bVel;
+    }
+  }
+  if (neighborCount > 0) {
+    perceivedCenter /= neighborCount;
+    cohesion /= neighborCount;
+    thisVel += (perceivedCenter - thisPos) * rule1Scale;
+    thisVel += cohesion * rule3Scale;
+  }
+  thisVel += separation * rule2Scale;
+  return thisVel;
 }
 
 /**
@@ -245,6 +298,15 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 newVel = computeVelocityChange(N, index, pos, vel1);
+  if (glm::length(newVel) > maxSpeed) {
+    newVel = (newVel / glm::length(newVel)) * maxSpeed;
+  }
+  vel2[index] = newVel;
 }
 
 /**
@@ -285,10 +347,25 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
 __global__ void kernComputeIndices(int N, int gridResolution,
   glm::vec3 gridMin, float inverseCellWidth,
   glm::vec3 *pos, int *indices, int *gridIndices) {
-    // TODO-2.1
-    // - Label each boid with the index of its grid cell.
-    // - Set up a parallel array of integer indices as pointers to the actual
-    //   boid data in pos and vel1/vel2
+  // TODO-2.1
+  // - Label each boid with the index of its grid cell.
+  // - Set up a parallel array of integer indices as pointers to the actual
+  //   boid data in pos and vel1/vel2
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 thisPos = pos[index];
+  glm::vec3 gridSpacePos = thisPos - gridMin;
+  //float sideLength = gridResolution / inverseCellWidth;
+  // want to find out the cell index that i am in and label it
+  int gridIdx = gridIndex3Dto1D(glm::floor(gridSpacePos.x * inverseCellWidth),
+                                glm::floor(gridSpacePos.y * inverseCellWidth),
+                                glm::floor(gridSpacePos.z * inverseCellWidth),
+                                gridResolution);
+  gridIndices[index] = gridIdx;
+  // store the boid indices
+  indices[index] = index;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,14 +383,34 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  int gridIdx = particleGridIndices[index];
+  // update start indices
+  if (index == 0) {
+    gridCellStartIndices[gridIdx] = 0;
+  } 
+  else if (particleGridIndices[index] != particleGridIndices[index - 1]) {
+    gridCellStartIndices[gridIdx] = index;
+  }
+  // update end indices
+  if (index == N - 1) {
+    gridCellEndIndices[gridIdx] = N - 1;
+  }
+  else if (particleGridIndices[index] != particleGridIndices[index + 1]) {
+    gridCellEndIndices[gridIdx] = index;
+  }
+  // the grids that don't have boids should have value -1
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+  int* gridCellStartIndices, int* gridCellEndIndices,
+  int* particleArrayIndices, int* particleGridIndices,
+  glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) {
   // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
   // - Identify the grid cell that this particle is in
@@ -322,8 +419,364 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 thisPos = pos[index];
+  glm::vec3 thisVel = vel1[index];
+  float sideLength = gridResolution * cellWidth;
+
+  int searchGridStart[8];
+  int searchGridEnd[8];
+  for (int i = 0; i < 8; i++) {
+    searchGridStart[i] = -1;
+    searchGridEnd[i] = -1;
+  }
+  glm::vec3 roundPos = glm::vec3(glm::round(thisPos.x / cellWidth) * cellWidth,
+    glm::round(thisPos.y / cellWidth) * cellWidth,
+    glm::round(thisPos.z / cellWidth) * cellWidth);
+
+  int arrI = 0;
+  for (int i = -1; i <= 1; i += 2) {
+    for (int j = -1; j <= 1; j += 2) {
+      for (int k = -1; k <= 1; k += 2) {
+        glm::vec3 offset = glm::vec3(i * 0.5 * cellWidth,
+                                     j * 0.5 * cellWidth,
+                                     k * 0.5 * cellWidth);
+        glm::vec3 gridSpacePos = roundPos - gridMin + offset;
+        //glm::vec3 offset = glm::vec3(i * 0.5 * cellWidth);
+        //glm::vec3 gridSpacePos = roundPos - gridMin + offset;
+        if (gridSpacePos.x > sideLength || gridSpacePos.x < 0.0f ||
+          gridSpacePos.y > sideLength || gridSpacePos.y < 0.0f ||
+          gridSpacePos.z > sideLength || gridSpacePos.z < 0.0f) {
+          continue;
+        }
+        int tempGridIdx = gridIndex3Dto1D(glm::floor(gridSpacePos.x * inverseCellWidth),
+                                          glm::floor(gridSpacePos.y * inverseCellWidth),
+                                          glm::floor(gridSpacePos.z * inverseCellWidth),
+                                          gridResolution);
+        if (gridCellStartIndices[tempGridIdx] == -1) {
+          continue;
+        }
+        else {
+          searchGridStart[arrI] = gridCellStartIndices[tempGridIdx];
+          searchGridEnd[arrI] = gridCellEndIndices[tempGridIdx];
+          arrI++;
+        }
+      }
+    }
+  }
+
+  // iterate and update velocity
+  glm::vec3 perceivedCenter = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 separation = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 cohesion = glm::vec3(0.0f, 0.0f, 0.0f);
+  int neighborCount = 0;
+
+  for (int i = 0; i < 8; i++) {
+    if (i < 0 || i >= 8) {
+      break;
+    }
+    for (int j = searchGridStart[i]; j <= searchGridEnd[i]; j++) {
+      if (particleArrayIndices[j] == index) {
+        continue;
+      }
+      glm::vec3 bPos = pos[particleArrayIndices[j]];
+      glm::vec3 bVel = vel1[particleArrayIndices[j]];
+      float distance = glm::length(thisPos - bPos);
+
+      // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+      if (distance < rule1Distance) {
+        perceivedCenter += bPos;
+        neighborCount++;
+      }
+      // Rule 2: boids try to stay a distance d away from each other
+      if (distance < rule2Distance) {
+        separation -= (bPos - thisPos);
+      }
+      // Rule 3: boids try to match the speed of surrounding boids
+      if (distance < rule3Distance) {
+        cohesion += bVel;
+      }
+    }
+  }
+
+  if (neighborCount > 0) {
+    perceivedCenter /= neighborCount;
+    cohesion /= neighborCount;
+    thisVel += (perceivedCenter - thisPos) * rule1Scale;
+    thisVel += cohesion * rule3Scale;
+  }
+  thisVel += separation * rule2Scale;
+
+  if (glm::length(thisVel) > maxSpeed) {
+    thisVel = (thisVel / glm::length(thisVel)) * maxSpeed;
+  }
+  vel2[index] = thisVel;
+
 }
 
+
+
+__global__ void kernUpdateVelNeighborSearchScattered27(
+  int N, int gridResolution, glm::vec3 gridMin,
+  float inverseCellWidth, float cellWidth,
+  int* gridCellStartIndices, int* gridCellEndIndices,
+  int* particleArrayIndices, int* particleGridIndices,
+  glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) {
+  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+  // the number of boids that need to be checked.
+  // - Identify the grid cell that this particle is in
+  // - Identify which cells may contain neighbors. This isn't always 8.
+  // - For each cell, read the start/end indices in the boid pointer array.
+  // - Access each boid in the cell and compute velocity change from
+  //   the boids rules, if this boid is within the neighborhood distance.
+  // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 thisPos = pos[index];
+  glm::vec3 thisVel = vel1[index];
+  float sideLength = gridResolution * cellWidth;
+
+  int searchGridStart[27];
+  int searchGridEnd[27];
+  for (int i = 0; i < 8; i++) {
+    searchGridStart[i] = -1;
+    searchGridEnd[i] = -1;
+  }
+  glm::vec3 gridSpacePos = thisPos - gridMin;
+  glm::vec3 midPos = glm::vec3(glm::floor(gridSpacePos.x / cellWidth) * cellWidth + 0.5*cellWidth,
+    glm::floor(gridSpacePos.y / cellWidth) * cellWidth + 0.5 * cellWidth,
+    glm::floor(gridSpacePos.z / cellWidth) * cellWidth + 0.5 * cellWidth);
+
+  int arrI = 0;
+  for (int i = -1; i <= 1; i += 1) {
+    for (int j = -1; j <= 1; j += 1) {
+      for (int k = -1; k <= 1; k += 1) {
+        glm::vec3 offset = glm::vec3(i * cellWidth,
+          j * cellWidth,
+          k * cellWidth);
+        glm::vec3 newPos = midPos + offset;
+        if (newPos.x > sideLength || newPos.x < 0.0f ||
+          newPos.y > sideLength || newPos.y < 0.0f ||
+          newPos.z > sideLength || newPos.z < 0.0f) {
+          continue;
+        }
+        int tempGridIdx = gridIndex3Dto1D(glm::floor(newPos.x * inverseCellWidth),
+          glm::floor(newPos.y * inverseCellWidth),
+          glm::floor(newPos.z * inverseCellWidth),
+          gridResolution);
+        if (gridCellStartIndices[tempGridIdx] == -1) {
+          continue;
+        }
+        else {
+          searchGridStart[arrI] = gridCellStartIndices[tempGridIdx];
+          searchGridEnd[arrI] = gridCellEndIndices[tempGridIdx];
+          arrI++;
+        }
+      }
+    }
+  }
+
+  // iterate and update velocity
+  glm::vec3 perceivedCenter = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 separation = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 cohesion = glm::vec3(0.0f, 0.0f, 0.0f);
+  int neighborCount = 0;
+
+  for (int i = 0; i < 27; i++) {
+    if (i < 0 || i >= 27) {
+      break;
+    }
+    for (int j = searchGridStart[i]; j <= searchGridEnd[i]; j++) {
+      if (particleArrayIndices[j] == index) {
+        continue;
+      }
+      glm::vec3 bPos = pos[particleArrayIndices[j]];
+      glm::vec3 bVel = vel1[particleArrayIndices[j]];
+      float distance = glm::length(thisPos - bPos);
+
+      // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+      if (distance < rule1Distance) {
+        perceivedCenter += bPos;
+        neighborCount++;
+      }
+      // Rule 2: boids try to stay a distance d away from each other
+      if (distance < rule2Distance) {
+        separation -= (bPos - thisPos);
+      }
+      // Rule 3: boids try to match the speed of surrounding boids
+      if (distance < rule3Distance) {
+        cohesion += bVel;
+      }
+    }
+  }
+
+  if (neighborCount > 0) {
+    perceivedCenter /= neighborCount;
+    cohesion /= neighborCount;
+    thisVel += (perceivedCenter - thisPos) * rule1Scale;
+    thisVel += cohesion * rule3Scale;
+  }
+  thisVel += separation * rule2Scale;
+
+  if (glm::length(thisVel) > maxSpeed) {
+    thisVel = (thisVel / glm::length(thisVel)) * maxSpeed;
+  }
+  vel2[index] = thisVel;
+
+}
+
+
+
+
+
+__global__ void kernUpdateVelNeighborSearchCoherent27(
+  int N, int gridResolution, glm::vec3 gridMin,
+  float inverseCellWidth, float cellWidth,
+  int* gridCellStartIndices, int* gridCellEndIndices,
+  glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) {
+  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+  // the number of boids that need to be checked.
+  // - Identify the grid cell that this particle is in
+  // - Identify which cells may contain neighbors. This isn't always 8.
+  // - For each cell, read the start/end indices in the boid pointer array.
+  // - Access each boid in the cell and compute velocity change from
+  //   the boids rules, if this boid is within the neighborhood distance.
+  // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 thisPos = pos[index];
+  glm::vec3 thisVel = vel1[index];
+  float sideLength = gridResolution * cellWidth;
+
+  int searchGridStart[27];
+  int searchGridEnd[27];
+  for (int i = 0; i < 8; i++) {
+    searchGridStart[i] = -1;
+    searchGridEnd[i] = -1;
+  }
+  glm::vec3 gridSpacePos = thisPos - gridMin;
+  glm::vec3 midPos = glm::vec3(glm::floor(gridSpacePos.x / cellWidth) * cellWidth + 0.5 * cellWidth,
+    glm::floor(gridSpacePos.y / cellWidth) * cellWidth + 0.5 * cellWidth,
+    glm::floor(gridSpacePos.z / cellWidth) * cellWidth + 0.5 * cellWidth);
+
+  int arrI = 0;
+  for (int i = -1; i <= 1; i += 1) {
+    for (int j = -1; j <= 1; j += 1) {
+      for (int k = -1; k <= 1; k += 1) {
+        glm::vec3 offset = glm::vec3(i * cellWidth,
+          j * cellWidth,
+          k * cellWidth);
+        glm::vec3 newPos = midPos + offset;
+        if (newPos.x > sideLength || newPos.x < 0.0f ||
+          newPos.y > sideLength || newPos.y < 0.0f ||
+          newPos.z > sideLength || newPos.z < 0.0f) {
+          continue;
+        }
+        int tempGridIdx = gridIndex3Dto1D(glm::floor(newPos.x * inverseCellWidth),
+          glm::floor(newPos.y * inverseCellWidth),
+          glm::floor(newPos.z * inverseCellWidth),
+          gridResolution);
+        if (gridCellStartIndices[tempGridIdx] == -1) {
+          continue;
+        }
+        else {
+          searchGridStart[arrI] = gridCellStartIndices[tempGridIdx];
+          searchGridEnd[arrI] = gridCellEndIndices[tempGridIdx];
+          arrI++;
+        }
+      }
+    }
+  }
+
+  // iterate and update velocity
+  glm::vec3 perceivedCenter = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 separation = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 cohesion = glm::vec3(0.0f, 0.0f, 0.0f);
+  int neighborCount = 0;
+
+  for (int i = 0; i < 27; i++) {
+    if (i < 0 || i >= 27) {
+      break;
+    }
+    for (int j = searchGridStart[i]; j <= searchGridEnd[i]; j++) {
+      if (j == index) {
+        continue;
+      }
+      glm::vec3 bPos = pos[j];
+      glm::vec3 bVel = vel1[j];
+      float distance = glm::length(thisPos - bPos);
+
+      // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+      if (distance < rule1Distance) {
+        perceivedCenter += bPos;
+        neighborCount++;
+      }
+      // Rule 2: boids try to stay a distance d away from each other
+      if (distance < rule2Distance) {
+        separation -= (bPos - thisPos);
+      }
+      // Rule 3: boids try to match the speed of surrounding boids
+      if (distance < rule3Distance) {
+        cohesion += bVel;
+      }
+    }
+  }
+
+  if (neighborCount > 0) {
+    perceivedCenter /= neighborCount;
+    cohesion /= neighborCount;
+    thisVel += (perceivedCenter - thisPos) * rule1Scale;
+    thisVel += cohesion * rule3Scale;
+  }
+  thisVel += separation * rule2Scale;
+
+  if (glm::length(thisVel) > maxSpeed) {
+    thisVel = (thisVel / glm::length(thisVel)) * maxSpeed;
+  }
+  vel2[index] = thisVel;
+
+}
+
+
+
+
+
+
+// -----------------------------------------------------------------------------
+__global__ void kernCopyVel(int N, glm::vec3* buffer1, glm::vec3* buffer2) {
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  buffer1[index] = buffer2[index];
+}
+
+__global__ void kernCopyInt(int N, int* buffer1, int* buffer2) {
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  buffer1[index] = buffer2[index];
+}
+
+__global__ void kernCopyDummy(int N, glm::vec3* buffer1, glm::vec3* dummy, int* arrayIndices) {
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  buffer1[index] = dummy[arrayIndices[index]];
+}
+
+// -----------------------------------------------------------------------------
+
+
 __global__ void kernUpdateVelNeighborSearchCoherent(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
@@ -341,14 +794,117 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 thisPos = pos[index];
+  glm::vec3 thisVel = vel1[index];
+  float sideLength = gridResolution * cellWidth;
+
+  int searchGridStart[8];
+  int searchGridEnd[8];
+  for (int i = 0; i < 8; i++) {
+    searchGridStart[i] = -1;
+    searchGridEnd[i] = -1;
+  }
+  glm::vec3 roundPos = glm::vec3(glm::round(thisPos.x / cellWidth) * cellWidth,
+    glm::round(thisPos.y / cellWidth) * cellWidth,
+    glm::round(thisPos.z / cellWidth) * cellWidth);
+
+  int arrI = 0;
+  for (int i = -1; i <= 1; i += 2) {
+    for (int j = -1; j <= 1; j += 2) {
+      for (int k = -1; k <= 1; k += 2) {
+        glm::vec3 offset = glm::vec3(i * 0.5 * cellWidth,
+          j * 0.5 * cellWidth,
+          k * 0.5 * cellWidth);
+        glm::vec3 gridSpacePos = roundPos - gridMin + offset;
+        if (gridSpacePos.x > sideLength || gridSpacePos.x < 0.0f ||
+          gridSpacePos.y > sideLength || gridSpacePos.y < 0.0f ||
+          gridSpacePos.z > sideLength || gridSpacePos.z < 0.0f) {
+          continue;
+        }
+        int tempGridIdx = gridIndex3Dto1D(glm::floor(gridSpacePos.x * inverseCellWidth),
+          glm::floor(gridSpacePos.y * inverseCellWidth),
+          glm::floor(gridSpacePos.z * inverseCellWidth),
+          gridResolution);
+        if (gridCellStartIndices[tempGridIdx] == -1) {
+          continue;
+        }
+        else {
+          searchGridStart[arrI] = gridCellStartIndices[tempGridIdx];
+          searchGridEnd[arrI] = gridCellEndIndices[tempGridIdx];
+          arrI++;
+        }
+      }
+    }
+  }
+
+  // iterate and update velocity
+  glm::vec3 perceivedCenter = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 separation = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 cohesion = glm::vec3(0.0f, 0.0f, 0.0f);
+  int neighborCount = 0;
+
+  for (int i = 0; (i < 8 && i != -1); i++) {
+    for (int j = searchGridStart[i]; j <= searchGridEnd[i]; j++) {
+      if (j == index) {
+        continue;
+      }
+      glm::vec3 bPos = pos[j];
+      glm::vec3 bVel = vel1[j];
+      float distance = glm::length(thisPos - bPos);
+
+      // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+      if (distance < rule1Distance) {
+        perceivedCenter += bPos;
+        neighborCount++;
+      }
+      // Rule 2: boids try to stay a distance d away from each other
+      if (distance < rule2Distance) {
+        separation -= (bPos - thisPos);
+      }
+      // Rule 3: boids try to match the speed of surrounding boids
+      if (distance < rule3Distance) {
+        cohesion += bVel;
+      }
+    }
+  }
+
+  if (neighborCount > 0) {
+    perceivedCenter /= neighborCount;
+    cohesion /= neighborCount;
+    thisVel += (perceivedCenter - thisPos) * rule1Scale;
+    thisVel += cohesion * rule3Scale;
+  }
+  thisVel += separation * rule2Scale;
+
+  if (glm::length(thisVel) > maxSpeed) {
+    thisVel = (thisVel / glm::length(thisVel)) * maxSpeed;
+  }
+  vel2[index] = thisVel;
 }
 
+
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce <<<fullBlocksPerGrid, blockSize>>> (numObjects, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+
+  kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+  // ping-pong
+  kernCopyVel << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernCopy Failed!");
+
+  
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +920,45 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices <<<fullBlocksPerGrid, blockSize>>> (numObjects, gridSideCount,
+   gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+  // use thrust to sort
+
+  thrust::device_ptr<int> dev_thrust_gridIndices(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_boidIndices(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_gridIndices, dev_thrust_gridIndices + numObjects, dev_thrust_boidIndices);
+
+  // preset start and end indices to -1 before updating them
+  dim3 blocksBasedOnCellCount((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer <<<blocksBasedOnCellCount, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+  kernResetIntBuffer << <blocksBasedOnCellCount, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+
+  // update start end indices
+  kernIdentifyCellStartEnd <<<fullBlocksPerGrid, blockSize >>> (numObjects, dev_particleGridIndices, 
+   dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+   //update velocity
+  kernUpdateVelNeighborSearchScattered27 << <fullBlocksPerGrid, blockSize >> > (
+    numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, 
+    dev_gridCellEndIndices, dev_particleArrayIndices, dev_particleGridIndices, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+  // update position and ping-pong
+  kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+  // ping pong
+  kernCopyVel << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernCopy Failed!");
+
+
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +977,56 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount,
+    gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+   //setup dummy arrays
+  kernCopyVel << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_dummyPos, dev_pos);
+  checkCUDAErrorWithLine("kernCopy Failed!");
+  kernCopyVel << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_dummyVel1, dev_vel1);
+  checkCUDAErrorWithLine("kernCopy Failed!");
+
+
+  // use thrust to sort
+  thrust::device_ptr<int> dev_thrust_gridIndices(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_boidIndices(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_gridIndices, dev_thrust_gridIndices + numObjects, dev_thrust_boidIndices);
+
+  kernCopyDummy << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_pos, dev_dummyPos, dev_particleArrayIndices);
+  checkCUDAErrorWithLine("kernCopyDummy Failed!");
+  kernCopyDummy << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_vel1, dev_dummyVel1, dev_particleArrayIndices);
+  checkCUDAErrorWithLine("kernCopyDummy Failed!");
+
+  
+
+  // preset start and end indices to -1 before updating them
+  dim3 blocksBasedOnCellCount((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer << <blocksBasedOnCellCount, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+  kernResetIntBuffer << <blocksBasedOnCellCount, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+
+  // update start end indices
+  kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices,
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+  //update velocity
+  kernUpdateVelNeighborSearchCoherent27 << <fullBlocksPerGrid, blockSize >> > (
+    numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices,
+    dev_gridCellEndIndices, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");
+
+  // update position and ping-pong
+  kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+  // ping pong
+  kernCopyVel << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernCopy Failed!");
 }
 
 void Boids::endSimulation() {
@@ -390,6 +1035,13 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
+  cudaFree(dev_dummyPos);
+  cudaFree(dev_dummyVel1);
 }
 
 void Boids::unitTest() {
@@ -397,11 +1049,15 @@ void Boids::unitTest() {
 
   // test unstable sort
   int *dev_intKeys;
+  int* dev_dummyKeys;
   int *dev_intValues;
+  glm::vec3* dev_newValues;
   int N = 10;
 
   std::unique_ptr<int[]>intKeys{ new int[N] };
+  std::unique_ptr<int[]>dummyKeys{ new int[N] };
   std::unique_ptr<int[]>intValues{ new int[N] };
+  std::unique_ptr<glm::vec3[]>newValues{ new glm::vec3[N] };
 
   intKeys[0] = 0; intValues[0] = 0;
   intKeys[1] = 1; intValues[1] = 1;
@@ -414,44 +1070,92 @@ void Boids::unitTest() {
   intKeys[8] = 5; intValues[8] = 8;
   intKeys[9] = 6; intValues[9] = 9;
 
+  dummyKeys[0] = 0; 
+  dummyKeys[1] = 1;
+  dummyKeys[2] = 0;
+  dummyKeys[3] = 3;
+  dummyKeys[4] = 0;
+  dummyKeys[5] = 2;
+  dummyKeys[6] = 2;
+  dummyKeys[7] = 0;
+  dummyKeys[8] = 5;
+  dummyKeys[9] = 6;
+
+  newValues[0] = glm::vec3(0.0, 0.0, 0.0);
+  newValues[1] = glm::vec3(1.0, 0.0, 0.0);
+  newValues[2] = glm::vec3(2.0, 0.0, 0.0);
+  newValues[3] = glm::vec3(3.0, 0.0, 0.0);
+  newValues[4] = glm::vec3(4.0, 0.0, 0.0);
+  newValues[5] = glm::vec3(5.0, 0.0, 0.0);
+  newValues[6] = glm::vec3(6.0, 0.0, 0.0);
+  newValues[7] = glm::vec3(7.0, 0.0, 0.0);
+  newValues[8] = glm::vec3(8.0, 0.0, 0.0);
+  newValues[9] = glm::vec3(9.0, 0.0, 0.0);
+
   cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
   checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
 
+  cudaMalloc((void**)&dev_dummyKeys, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_dummyKeys failed!");
+
   cudaMalloc((void**)&dev_intValues, N * sizeof(int));
   checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
 
+  cudaMalloc((void**)&dev_newValues, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_newValues failed!");
+  
+
+
   dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
 
   std::cout << "before unstable sort: " << std::endl;
   for (int i = 0; i < N; i++) {
     std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
+    std::cout << " value: " << intValues[i];
+    std::cout << " newValue: " << newValues[i].x << std::endl;
   }
 
   // How to copy data to the GPU
   cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(dev_dummyKeys, dummyKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
   cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(dev_newValues, newValues.get(), sizeof(glm::vec3) * N, cudaMemcpyHostToDevice);
+
+  /*for (int i = 0; i < N; i++) {
+    dev_dummyKeys[i] = dev_intKeys[i];
+  }*/
+  //dev_dummyKeys = dev_intKeys;
+  //dev_dummyKeys[0] = 0;
 
   // Wrap device vectors in thrust iterators for use with thrust.
   thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+  //thrust::device_ptr<int> dev_thrust_dummykeys(dev_dummyKeys);
   thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+  thrust::device_ptr<glm::vec3> dev_thrust_newvalues(dev_newValues);
   // LOOK-2.1 Example for using thrust::sort_by_key
   thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
+  //thrust::sort_by_key(dev_thrust_dummykeys, dev_thrust_dummykeys + N, dev_thrust_newvalues);
+
 
   // How to copy data back to the CPU side from the GPU
   cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+  cudaMemcpy(dummyKeys.get(), dev_dummyKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
   cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+  cudaMemcpy(newValues.get(), dev_newValues, sizeof(glm::vec3) * N, cudaMemcpyDeviceToHost);
   checkCUDAErrorWithLine("memcpy back failed!");
 
   std::cout << "after unstable sort: " << std::endl;
   for (int i = 0; i < N; i++) {
     std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
+    std::cout << " value: " << intValues[i];
+    std::cout << " newValue: " << newValues[i].x << std::endl;
   }
 
   // cleanup
   cudaFree(dev_intKeys);
+  cudaFree(dev_dummyKeys);
   cudaFree(dev_intValues);
+  cudaFree(dev_newValues);
   checkCUDAErrorWithLine("cudaFree failed!");
   return;
 }
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..5c32af4 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,12 +13,14 @@
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
+#define VISUALIZE 0
+#define UNIFORM_GRID 1
 #define COHERENT_GRID 0
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+//const int N_FOR_VIS = 20000;
+const int N_FOR_VIS = 500;
+//const float DT = 0.5f;
 const float DT = 0.2f;
 
 /**
diff --git a/src/main.hpp b/src/main.hpp
index 88e9df7..8f62438 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -36,9 +36,11 @@ const float fovy = (float) (PI / 4);
 const float zNear = 0.10f;
 const float zFar = 10.0f;
 // LOOK-1.2: for high DPI displays, you may want to double these settings.
-int width = 1280;
-int height = 720;
-int pointSize = 2;
+int width = 3840;
+int height = 2160;
+//int width = 2560;
+//int height = 1440;
+int pointSize = 4;
 
 // For camera controls
 bool leftMousePressed = false;