Skip to content

Commit bbdb5cc

Browse files
etwestGillgamesh
andauthored
Reproducibility for ALENEX (#22)
* initial code for runme.sh * code for building * cleanup old code * more reprod progress * scripting for volume setup * more progress on runme * install mpi on main node * sync * bug fixes * more bug fixes * cleanup cmake and fix vol_id bug * more bug fixing * exit if error * sleep between attach and mount * provision better volume * more runme progress * more progress on experiment scripts * forgot some files * automate security group and fix project name * Update README.md * start of python script for creating workers * more progress on experiments and aws scripts * python scripts * silly typo made it in somehow * missing variable calls removed * Fixed getting instance Ids + command * put in the worker spawning/starting/stopping code * updated ami id TODO make sure its configurable * added wait until instances running * added wait on initial tagging too * forgot that we werent in the aws folder * termination script * main node tagging * ignore terminated nodes when fetching instance ids * fully working experiments? * added security groups option * worker args ready to go * actually calling terminate * little fixes * little fixes * added sg setting for main node * more fixes for scripting * wait after create * more small fixes * small fixes * setup cluster stuffs * typo * typo * typos and sleep * remove pauses * fix printing errors * don't wait * fixes to output formats * typo * bug fixes * fix query expr * fix things * ablative fix * typo * typo * more fixes yay * adjust plotting * improve pdf look * limited experiments option * small adjustment * some error checking * Update README.md * Update README.md * more adjustments for reproducibility * Update README.md * Update README.md --------- Co-authored-by: Gilvir Gill <gilvirgillgg@gmail.com>
1 parent 7a6dcf9 commit bbdb5cc

31 files changed

+1144
-318
lines changed

CMakeLists.txt

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@ set(CMAKE_CXX_STANDARD 14)
55
set(CMAKE_CXX_STANDARD_REQUIRED ON)
66
set(CMAKE_CXX_EXTENSIONS ON)
77

8-
project(DistributedStreamingCC)
8+
project(Landscape)
9+
10+
# Recommend not to set these options. They are for our ablative experiments
11+
# USE_CUBE: Use the CubeSketch sampling algorithm
12+
# NO_STANDALONE: Use StandAloneGutters as the guttering system
913

1014
# Make the default build type Release. If user or another
1115
# project sets a different value than use that
@@ -14,7 +18,7 @@ if(NOT CMAKE_BUILD_TYPE)
1418
set(CMAKE_BUILD_TYPE "Release" CACHE
1519
STRING "Choose the type of build." FORCE)
1620
endif()
17-
message(STATUS "DistributedCC Build Type ${CMAKE_BUILD_TYPE}")
21+
message(STATUS "Landscape Build Type ${CMAKE_BUILD_TYPE}")
1822

1923
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
2024
message(STATUS "Adding GNU compiler flags")
@@ -49,9 +53,15 @@ FetchContent_Declare(
4953
GIT_TAG v2.0.0
5054
)
5155
FetchContent_MakeAvailable(GraphZeppelin)
52-
# Uncomment the next two lines to use CubeSketch algorithm
53-
# target_compile_definitions(GraphZeppelin PUBLIC L0_SAMPLING)
54-
# target_compile_definitions(GraphZeppelinVerifyCC PUBLIC L0_SAMPLING)
56+
57+
if (USE_CUBE)
58+
message(STATUS "Using CubeSketch sampling algorithm")
59+
target_compile_definitions(GraphZeppelin PUBLIC L0_SAMPLING)
60+
target_compile_definitions(GraphZeppelinVerifyCC PUBLIC L0_SAMPLING)
61+
endif()
62+
if (USE_STANDALONE)
63+
message(STATUS "Using StandAlone Gutters for gts")
64+
endif()
5565

5666
if (BUILD_BENCH)
5767
# Get Google Benchmark
@@ -68,47 +78,51 @@ endif()
6878

6979
# The library for distributing the CPU work for
7080
# generating sketch deltas
71-
add_library(DistribUpdateStreamingCC
81+
add_library(Landscape
7282
src/worker_cluster.cpp
7383
src/work_distributor.cpp
7484
src/distributed_worker.cpp
7585
src/message_forwarders.cpp
7686
src/graph_distrib_update.cpp
7787
)
78-
add_dependencies(DistribUpdateStreamingCC GraphZeppelin)
79-
target_link_libraries(DistribUpdateStreamingCC PUBLIC GraphZeppelin ${MPI_LIBRARIES})
80-
target_include_directories(DistribUpdateStreamingCC PUBLIC include/ ${MPI_C_INCLUDE_PATH})
81-
# TODO: Is MPI INCLUDE PATH necessary?
88+
add_dependencies(Landscape GraphZeppelin)
89+
target_link_libraries(Landscape PUBLIC GraphZeppelin ${MPI_LIBRARIES})
90+
target_include_directories(Landscape PUBLIC include/ ${MPI_C_INCLUDE_PATH})
8291
if(MPI_COMPILE_FLAGS)
83-
set_target_properties(DistribUpdateStreamingCC PROPERTIES
92+
set_target_properties(Landscape PROPERTIES
8493
COMPILE_FLAGS "${MPI_COMPILE_FLAGS}")
8594
endif()
8695
if(MPI_LINK_FLAGS)
87-
set_target_properties(DistribUpdateStreamingCC PROPERTIES
96+
set_target_properties(Landscape PROPERTIES
8897
LINK_FLAGS "${MPI_LINK_FLAGS}")
8998
endif()
99+
if (USE_STANDALONE)
100+
target_compile_definitions(Landscape PUBLIC USE_STANDALONE)
101+
endif()
90102

91103
# A library for testing our code for distributing
92104
# generating sketch deltas
93-
add_library(DistribUpdateStreamingVerifyCC
105+
add_library(LandscapeVerify
94106
src/worker_cluster.cpp
95107
src/work_distributor.cpp
96108
src/distributed_worker.cpp
97109
src/message_forwarders.cpp
98110
src/graph_distrib_update.cpp
99111
)
100-
add_dependencies(DistribUpdateStreamingVerifyCC GraphZeppelinVerifyCC)
101-
target_link_libraries(DistribUpdateStreamingVerifyCC PUBLIC GraphZeppelinVerifyCC ${MPI_LIBRARIES})
102-
target_include_directories(DistribUpdateStreamingVerifyCC PUBLIC include/ ${MPI_C_INCLUDE_PATH})
103-
# TODO: Is MPI INCLUDE PATH necessary?
112+
add_dependencies(LandscapeVerify GraphZeppelinVerifyCC)
113+
target_link_libraries(LandscapeVerify PUBLIC GraphZeppelinVerifyCC ${MPI_LIBRARIES})
114+
target_include_directories(LandscapeVerify PUBLIC include/ ${MPI_C_INCLUDE_PATH})
104115
if(MPI_COMPILE_FLAGS)
105-
set_target_properties(DistribUpdateStreamingVerifyCC PROPERTIES
116+
set_target_properties(LandscapeVerify PROPERTIES
106117
COMPILE_FLAGS "${MPI_COMPILE_FLAGS}")
107118
endif()
108119
if(MPI_LINK_FLAGS)
109-
set_target_properties(DistribUpdateStreamingVerifyCC PROPERTIES
120+
set_target_properties(LandscapeVerify PROPERTIES
110121
LINK_FLAGS "${MPI_LINK_FLAGS}")
111122
endif()
123+
if (USE_STANDALONE)
124+
target_compile_definitions(LandscapeVerify PUBLIC USE_STANDALONE)
125+
endif()
112126

113127
add_executable(distrib_tests
114128
test/distributed_graph_test.cpp
@@ -117,32 +131,32 @@ add_executable(distrib_tests
117131
${GraphZeppelin_SOURCE_DIR}/test/util/graph_gen.cpp
118132
${GraphZeppelin_SOURCE_DIR}/test/util/file_graph_verifier.cpp
119133
)
120-
add_dependencies(distrib_tests DistribUpdateStreamingVerifyCC)
121-
target_link_libraries(distrib_tests PUBLIC DistribUpdateStreamingVerifyCC)
134+
add_dependencies(distrib_tests LandscapeVerify)
135+
target_link_libraries(distrib_tests PUBLIC LandscapeVerify)
122136

123137
add_executable(speed_expr
124138
experiment/cluster_speed_expr.cpp
125139
)
126-
add_dependencies(speed_expr DistribUpdateStreamingCC)
127-
target_link_libraries(speed_expr PUBLIC DistribUpdateStreamingCC)
140+
add_dependencies(speed_expr Landscape)
141+
target_link_libraries(speed_expr PUBLIC Landscape)
128142

129143
add_executable(k_speed_expr
130144
experiment/cluster_k_connect_expr.cpp
131145
)
132-
add_dependencies(k_speed_expr DistribUpdateStreamingCC)
133-
target_link_libraries(k_speed_expr PUBLIC DistribUpdateStreamingCC)
146+
add_dependencies(k_speed_expr Landscape)
147+
target_link_libraries(k_speed_expr PUBLIC Landscape)
134148

135149
add_executable(query_expr
136150
experiment/cluster_query_expr.cpp
137151
)
138-
add_dependencies(query_expr DistribUpdateStreamingCC)
139-
target_link_libraries(query_expr PUBLIC DistribUpdateStreamingCC)
152+
add_dependencies(query_expr Landscape)
153+
target_link_libraries(query_expr PUBLIC Landscape)
140154

141155
add_executable(correctness_expr
142156
experiment/cont_expr.cpp
143157
)
144-
add_dependencies(correctness_expr DistribUpdateStreamingVerifyCC)
145-
target_link_libraries(correctness_expr PUBLIC DistribUpdateStreamingVerifyCC)
158+
add_dependencies(correctness_expr LandscapeVerify)
159+
target_link_libraries(correctness_expr PUBLIC LandscapeVerify)
146160

147161
#####################################################################
148162
##################### Streaming utilities #####################

README.md

Lines changed: 32 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -1,124 +1,37 @@
11
# Landscape
22
Linear sketching for the connected components and k-edge connectivity problems. Landscape distributes the CPU intensive work of performing sketch updates to many worker nodes while keeping sketch data on the main node. The result is that we can process graph update streams at near sequential RAM bandwidth speeds.
33

4-
## Running experiments
5-
1. If the stream lives in a file, ensure that the file has been brought into the file cache before beginning the experiment. One way to do this is `cat 'stream_file' > /dev/null`
6-
2. You can monitor the status of the cluster by, in a seperate window, running the command `watch -n 1 cat cluster_status.txt`
7-
8-
## Cluster Provisioning
9-
### Ensure the master is able to read IPS
10-
There is an IAM Role that allows the EC2 instance to read IPS. This is used to automatically get the IPS.
11-
![image](https://user-images.githubusercontent.com/4708326/164508403-70fbb271-fa4c-4145-9093-ff86320e1bba.png)
12-
13-
### Tag the Master and Worker nodes
14-
15-
The script only reads properly tagged EC2 instances. The Master must be tagged 'ClusterNodeType:Master' to appear at the top of the host files. The Workers must be tagged 'ClusterNodeType:Worker'.
16-
17-
![image](https://user-images.githubusercontent.com/4708326/164511717-02f2feee-a9f8-4b04-a35e-fb53be5140ee.png)
18-
19-
## Main Node Installation and Setup
20-
21-
### 1. Install packages
4+
## Using Landscape
5+
Landscape is a C++ library built with CMake. You can easily use Landscape in your code through CMake with FetchContent or ExternalProject_Add.
6+
Requirements
7+
- OS: Unix (Not Mac): Tested on Amazon Linux and Ubuntu
8+
- cmake >= 3.16
9+
- openmpi 4.1.3
10+
- c++14
11+
12+
## Reproducing Our Experiments on EC2
13+
Landscape appears in [ALENEX'25](). You can reproduce our paper's experimental results by following these instructions. You will need access to an AWS account with roughly $60 in credits.
14+
15+
1. Create an AWS Secret Key. `IAM->Users->YourUsername->Security credentials`. Make note of the access key and secret key.
16+
2. Provision the Main Node on EC2. `EC2->Instances->Launch instances`
17+
- Select the Amazon Linux 2023 AMI. (That is, not Amazon Linux 2 AMI)
18+
- Choose `c5n.18xlarge` as the instance type.
19+
- Create a new key pair. Select `RSA` and call this key `Creation_Key`. (If you have already created this key pair then skip this step)
20+
- Select `Creation_Key` as the key pair.
21+
- Under Advanced details select create new placement group. Call the group `DistributedStreaming` and select `Cluster` as the placement strategy. (If you have already created this placement group then skip this step)
22+
- Select the `DistributedStreaming` placement group.
23+
4. Upload the ssh keypair to the main node. `rsync -ve "ssh -i </path/to/key>" </path/to/key> ec2-user@<public-dns-addr-of-main>:~/.ssh/id_rsa`
24+
- Find the public dns address `EC2->Instances->click instance->PublicIPv4 DNS`.
25+
5. Connect to the main node. `ssh -i <path/to/key> ec2-user@<public-dns-addr-of-main>`
26+
6. Install packages
2227
```
2328
sudo yum update -y
24-
sudo yum install -y tmux htop git gcc-c++ jq python3-pip
25-
pip install ansible
26-
```
27-
28-
### 2. Install cmake version 3.16+
29-
First Step:
30-
#### x86_64
31-
```
32-
wget https://github.com/Kitware/CMake/releases/download/v3.23.0-rc2/cmake-3.23.0-rc2-linux-x86_64.sh
33-
sudo mkdir /opt/cmake
34-
sudo sh cmake-3.23.0-rc2-linux-x86_64.sh --prefix=/opt/cmake
35-
```
36-
#### aarch64
37-
```
38-
wget https://github.com/Kitware/CMake/releases/download/v3.23.0-rc5/cmake-3.23.0-rc5-linux-aarch64.sh
39-
sudo mkdir /opt/cmake
40-
sudo sh cmake-3.23.0-rc5-linux-aarch64.sh --prefix=/opt/cmake
41-
```
42-
Second Step:
43-
```
44-
sudo ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
45-
```
46-
When running cmake .sh script enter y to license and n to install location.
47-
These commands install cmake version 3.23 but any version >= 3.16 will work.
48-
49-
### 4. Setup ssh keys
50-
* Copy EMR.pem to cluster `rsync -ve "ssh -i </path/to/EMR.pem>" </path/to/EMR.pem> <AWS-user>@<main_node_dns_addr>:.`
51-
* Ensure key being used is default rsa key for ssh `id_rsa` for example `cp EMR.pem ~/.ssh/id_rsa`
52-
53-
### 5. Clone DistributedStreamingCC Repo
54-
55-
## Cluster Setup
56-
57-
### Run `setup_tagged_workers.sh`
58-
This bash script will construct the ansible `inventory.ini` file; and the MPI `hostfile` and `rankfile`. The arguments to the script are the EC2 region where our machines are, number of physical CPUs on the main node, and number of physical CPUs on the worker nodes.
59-
60-
Example:
61-
```
62-
./setup_tagged_workers.sh us-west-2 36 8
63-
```
64-
The script will automatically set the known_hosts for all the machines in the cluster to whatever ssh-keyscan finds (this is a slight security issue if you don't trust the cluster but should be fine as we aren't transmitting sensative data). It will additionally confirm with you that the `inventory.ini` and `hostfile` it creates look reasonable.
65-
66-
### Run unit tests
67-
After running the setup script you should be able to run the unit tests from the build directory.
68-
```
69-
mpirun -np 22 -hostfile hostfile -rf rankfile ./distrib_tests
70-
```
71-
-np denotes the number of processes to run. Should be number of worker nodes +21.
72-
73-
## Cluster Setup (Manual)
74-
Ansible files for setting up the cluster are found under `tools/ansible`.
75-
Ansible commands are run with `ansible-playbook -i /path/to/inventory.ini /path/to/<script>.yaml`.
76-
77-
### 1. Distribute ssh keys to cluster
78-
* Run ansible file `ssh.yaml` with `ansible-playbook -i inventory.ini DistributedStreamingCC/tools/ansible/ssh.yaml`
79-
80-
### 2. Install MPI on nodes in cluster
81-
* Run ansible file `mpi.yaml` with `ansible-playbook -i inventory.ini DistributedStreamingCC/tools/ansible/mpi.yaml`
82-
* Run `source ~/.bashrc` in open terminal on main node
83-
84-
### 3. Build Distributed Streaming Repo
85-
* make `build` directory in project repo
86-
* run `cmake .. ; make -j` in build directory
87-
88-
### 4. Distribute executables and hostfile to worker nodes
89-
* Run ansible file `files.yaml` with `ansible-playbook -i inventory.ini DistributedStreamingCC/tools/ansible/files.yaml`
90-
91-
### EFA Installation instructions
92-
* Follow the instructions at https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security
93-
94-
## Amazon Storage
95-
### EBS Storage
96-
EBS disks are generally found installed at `/mnt/nvmeXnX` where X is the disk number. In order to use them, the disk must be formatted and then mounted.
97-
* `sudo lsblk -f` to list all devices
98-
* (Optional) If no filesystem exists on the device than run `sudo mkfs -t xfs /dev/<device>` to format the drive. This will overwrite the content on the device so DO NOT do this if a filesystem already exists.
99-
* Create the mount point directory `sudo mkdir /mnt/<mnt_point>`
100-
* Mount the device `sudo mount /dev/<device> /mnt/<mnt_point>`
101-
* Adjust owner and permissions of mount point `sudo chown -R <user> /mnt/<mnt_point>` and `chmod a+rw /mnt/<mnt_point>`
102-
103-
## Single Machine Setup
104-
105-
### 1. Install OpenMPI
106-
For Ubuntu the following will install openmpi
107-
```
108-
sudo apt update
109-
sudo apt install libopenmpi-dev
110-
```
111-
Google is your friend for other operating systems :)
112-
113-
### 2. Run executables
114-
Use the `mpirun` command to run mpi programs. For example, to run the unit tests with 4 processes, the following command is used.
115-
```
116-
mpirun -np 4 ./distrib_tests
117-
```
118-
119-
## Tips for Debugging with MPI
120-
If you want to run the code using a debugging tool like gdb you can perform the following steps.
121-
1. Compile with debugging flags `cmake -DCMAKE_BUILD_TYPE=Debug .. ; make`
122-
2. Launch the mpi task with each process in its own window using xterm `mpirun -np <num_proc> term -hold -e gdb <executable>`
123-
124-
Print statement debugging can also be helpful, as even when running in a cluster across many machines, all the output to console across the workers is printed out by the main process.
29+
sudo yum install -y tmux git
30+
```
31+
7. Clone this repository. IMPORTANT: Ensure the repository is cloned to the ec2-user home directory and that the name is unchanged. `~/Landscape`
32+
8. From `~/Landscape` run `bash runme.sh`. This script will prompt you for the following:
33+
- Agree to the use of sudo commands
34+
- Choose whether to run the `full` experiments (all datapoints) or `limited` experiments (fewer datapoints per experiment)
35+
- Enter your AWS secret key and default EC2 region (this should be the region in which the main node was created)
36+
9. After the experiments conclude, copy `figures.pdf` from `~/Landscape` to your personal computer. You can accomplish this by running: `rsync -ve "ssh -i ~/.ssh/Creation_Key.pem" ec2-user@<public-dns-addr-of-main>:~/Landscape/figures.pdf .` on your personal computer.
37+
10. Terminate the main node in EC2

experiment/cluster_k_connect_expr.cpp

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -105,21 +105,23 @@ int main(int argc, char** argv) {
105105
for (node_id_t src = 0; src < num_nodes; src++) {
106106
edges += sf_adj[src].size();
107107
}
108-
109-
std::ofstream out{output, std::ofstream::out | std::ofstream::app}; // open the outfile
110108
std::cout << "number of spanning forest edges: " << edges << std::endl;
111-
std::cout << "Writing runtime stats to " << output << std::endl;
112109

113-
// calculate the insertion rate and write to file
110+
// calculate the insertion rate and print
114111
// insertion rate measured in stream updates
115112
// (not in the two sketch updates we process per stream update)
116113
float ins_per_sec = (((float)(total * repeats)) / runtime.count());
117-
out << "Procesing " << total * repeats << " updates took " << runtime.count() << " seconds, "
118-
<< ins_per_sec << " per second\n";
119114

120-
out << "Finding " << num_forests << " Spanning Forests took " << CC_time.count()
121-
<< " and found " << edges << " edges\n";
122-
out << "Total Memory used (MiB): " << get_max_mem_used() << std::endl;
115+
std::cout << "Processing " << total * repeats << " updates took " << runtime.count()
116+
<< " seconds, " << ins_per_sec << " per second\n";
117+
118+
std::cout << "Finding " << num_forests << " Spanning Forests took " << CC_time.count()
119+
<< " and found " << edges << " edges\n";
120+
121+
std::ofstream out{output, std::ofstream::out | std::ofstream::app}; // open the outfile
122+
std::cout << "Writing runtime stats to " << output << std::endl;
123+
124+
out << std::fixed << ins_per_sec / 1e6 << ", " << CC_time.count() << ", " << get_max_mem_used() << std::endl;
123125
out.close();
124126
} else {
125127
node_id_t num_nodes = std::stoull(argv[4]);
@@ -180,21 +182,19 @@ int main(int argc, char** argv) {
180182
for (node_id_t src = 0; src < num_nodes; src++) {
181183
edges += sf_adj[src].size();
182184
}
185+
std::cout << "number of spanning forest edges: " << edges << std::endl;
186+
float ins_per_sec = (((float)(num_edges)) / runtime.count());
187+
188+
std::cout << "Processing " << num_edges << " updates took " << runtime.count()
189+
<< " seconds, " << ins_per_sec << " per second\n";
190+
191+
std::cout << "Finding " << num_forests << " Spanning Forests took " << CC_time.count()
192+
<< " and found " << edges << " edges\n";
183193

184194
std::ofstream out{output, std::ofstream::out | std::ofstream::app}; // open the outfile
185-
std::cout << "number of spanning forest edges: " << edges << std::endl;
186195
std::cout << "Writing runtime stats to " << output << std::endl;
187196

188-
// calculate the insertion rate and write to file
189-
// insertion rate measured in stream updates
190-
// (not in the two sketch updates we process per stream update)
191-
float ins_per_sec = (float(num_edges) / runtime.count());
192-
out << "Procesing " << edges << " updates took " << runtime.count() << " seconds, "
193-
<< ins_per_sec << " per second\n";
194-
195-
out << "Finding " << num_forests << " Spanning Forests took " << CC_time.count()
196-
<< " and found " << edges << " edges\n";
197-
out << "Total Memory used (MiB): " << get_max_mem_used() << std::endl;
197+
out << std::fixed << ins_per_sec / 1e6 << ", " << CC_time.count() << ", " << get_max_mem_used() << std::endl;
198198
out.close();
199199
}
200200

0 commit comments

Comments
 (0)