Skip to content

Commit bbdb5cc

Browse files
etwestGillgamesh
andauthored
Reproducibility for ALENEX (#22)
* initial code for runme.sh * code for building * cleanup old code * more reprod progress * scripting for volume setup * more progress on runme * install mpi on main node * sync * bug fixes * more bug fixes * cleanup cmake and fix vol_id bug * more bug fixing * exit if error * sleep between attach and mount * provision better volume * more runme progress * more progress on experiment scripts * forgot some files * automate security group and fix project name * Update README.md * start of python script for creating workers * more progress on experiments and aws scripts * python scripts * silly typo made it in somehow * missing variable calls removed * Fixed getting instance Ids + command * put in the worker spawning/starting/stopping code * updated ami id TODO make sure its configurable * added wait until instances running * added wait on initial tagging too * forgot that we werent in the aws folder * termination script * main node tagging * ignore terminated nodes when fetching instance ids * fully working experiments? * added security groups option * worker args ready to go * actually calling terminate * little fixes * little fixes * added sg setting for main node * more fixes for scripting * wait after create * more small fixes * small fixes * setup cluster stuffs * typo * typo * typos and sleep * remove pauses * fix printing errors * don't wait * fixes to output formats * typo * bug fixes * fix query expr * fix things * ablative fix * typo * typo * more fixes yay * adjust plotting * improve pdf look * limited experiments option * small adjustment * some error checking * Update README.md * Update README.md * more adjustments for reproducibility * Update README.md * Update README.md --------- Co-authored-by: Gilvir Gill <gilvirgillgg@gmail.com>
1 parent 7a6dcf9 commit bbdb5cc

31 files changed

+1144
-318
lines changed

CMakeLists.txt

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@ set(CMAKE_CXX_STANDARD 14)
55
set(CMAKE_CXX_STANDARD_REQUIRED ON)
66
set(CMAKE_CXX_EXTENSIONS ON)
77

8-
project(DistributedStreamingCC)
8+
project(Landscape)
9+
10+
# Recommend not to set these options. They are for our ablative experiments
11+
# USE_CUBE: Use the CubeSketch sampling algorithm
12+
# NO_STANDALONE: Use StandAloneGutters as the guttering system
913

1014
# Make the default build type Release. If user or another
1115
# project sets a different value than use that
@@ -14,7 +18,7 @@ if(NOT CMAKE_BUILD_TYPE)
1418
set(CMAKE_BUILD_TYPE "Release" CACHE
1519
STRING "Choose the type of build." FORCE)
1620
endif()
17-
message(STATUS "DistributedCC Build Type ${CMAKE_BUILD_TYPE}")
21+
message(STATUS "Landscape Build Type ${CMAKE_BUILD_TYPE}")
1822

1923
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
2024
message(STATUS "Adding GNU compiler flags")
@@ -49,9 +53,15 @@ FetchContent_Declare(
4953
GIT_TAG v2.0.0
5054
)
5155
FetchContent_MakeAvailable(GraphZeppelin)
52-
# Uncomment the next two lines to use CubeSketch algorithm
53-
# target_compile_definitions(GraphZeppelin PUBLIC L0_SAMPLING)
54-
# target_compile_definitions(GraphZeppelinVerifyCC PUBLIC L0_SAMPLING)
56+
57+
if (USE_CUBE)
58+
message(STATUS "Using CubeSketch sampling algorithm")
59+
target_compile_definitions(GraphZeppelin PUBLIC L0_SAMPLING)
60+
target_compile_definitions(GraphZeppelinVerifyCC PUBLIC L0_SAMPLING)
61+
endif()
62+
if (USE_STANDALONE)
63+
message(STATUS "Using StandAlone Gutters for gts")
64+
endif()
5565

5666
if (BUILD_BENCH)
5767
# Get Google Benchmark
@@ -68,47 +78,51 @@ endif()
6878

6979
# The library for distributing the CPU work for
7080
# generating sketch deltas
71-
add_library(DistribUpdateStreamingCC
81+
add_library(Landscape
7282
src/worker_cluster.cpp
7383
src/work_distributor.cpp
7484
src/distributed_worker.cpp
7585
src/message_forwarders.cpp
7686
src/graph_distrib_update.cpp
7787
)
78-
add_dependencies(DistribUpdateStreamingCC GraphZeppelin)
79-
target_link_libraries(DistribUpdateStreamingCC PUBLIC GraphZeppelin ${MPI_LIBRARIES})
80-
target_include_directories(DistribUpdateStreamingCC PUBLIC include/ ${MPI_C_INCLUDE_PATH})
81-
# TODO: Is MPI INCLUDE PATH necessary?
88+
add_dependencies(Landscape GraphZeppelin)
89+
target_link_libraries(Landscape PUBLIC GraphZeppelin ${MPI_LIBRARIES})
90+
target_include_directories(Landscape PUBLIC include/ ${MPI_C_INCLUDE_PATH})
8291
if(MPI_COMPILE_FLAGS)
83-
set_target_properties(DistribUpdateStreamingCC PROPERTIES
92+
set_target_properties(Landscape PROPERTIES
8493
COMPILE_FLAGS "${MPI_COMPILE_FLAGS}")
8594
endif()
8695
if(MPI_LINK_FLAGS)
87-
set_target_properties(DistribUpdateStreamingCC PROPERTIES
96+
set_target_properties(Landscape PROPERTIES
8897
LINK_FLAGS "${MPI_LINK_FLAGS}")
8998
endif()
99+
if (USE_STANDALONE)
100+
target_compile_definitions(Landscape PUBLIC USE_STANDALONE)
101+
endif()
90102

91103
# A library for testing our code for distributing
92104
# generating sketch deltas
93-
add_library(DistribUpdateStreamingVerifyCC
105+
add_library(LandscapeVerify
94106
src/worker_cluster.cpp
95107
src/work_distributor.cpp
96108
src/distributed_worker.cpp
97109
src/message_forwarders.cpp
98110
src/graph_distrib_update.cpp
99111
)
100-
add_dependencies(DistribUpdateStreamingVerifyCC GraphZeppelinVerifyCC)
101-
target_link_libraries(DistribUpdateStreamingVerifyCC PUBLIC GraphZeppelinVerifyCC ${MPI_LIBRARIES})
102-
target_include_directories(DistribUpdateStreamingVerifyCC PUBLIC include/ ${MPI_C_INCLUDE_PATH})
103-
# TODO: Is MPI INCLUDE PATH necessary?
112+
add_dependencies(LandscapeVerify GraphZeppelinVerifyCC)
113+
target_link_libraries(LandscapeVerify PUBLIC GraphZeppelinVerifyCC ${MPI_LIBRARIES})
114+
target_include_directories(LandscapeVerify PUBLIC include/ ${MPI_C_INCLUDE_PATH})
104115
if(MPI_COMPILE_FLAGS)
105-
set_target_properties(DistribUpdateStreamingVerifyCC PROPERTIES
116+
set_target_properties(LandscapeVerify PROPERTIES
106117
COMPILE_FLAGS "${MPI_COMPILE_FLAGS}")
107118
endif()
108119
if(MPI_LINK_FLAGS)
109-
set_target_properties(DistribUpdateStreamingVerifyCC PROPERTIES
120+
set_target_properties(LandscapeVerify PROPERTIES
110121
LINK_FLAGS "${MPI_LINK_FLAGS}")
111122
endif()
123+
if (USE_STANDALONE)
124+
target_compile_definitions(LandscapeVerify PUBLIC USE_STANDALONE)
125+
endif()
112126

113127
add_executable(distrib_tests
114128
test/distributed_graph_test.cpp
@@ -117,32 +131,32 @@ add_executable(distrib_tests
117131
${GraphZeppelin_SOURCE_DIR}/test/util/graph_gen.cpp
118132
${GraphZeppelin_SOURCE_DIR}/test/util/file_graph_verifier.cpp
119133
)
120-
add_dependencies(distrib_tests DistribUpdateStreamingVerifyCC)
121-
target_link_libraries(distrib_tests PUBLIC DistribUpdateStreamingVerifyCC)
134+
add_dependencies(distrib_tests LandscapeVerify)
135+
target_link_libraries(distrib_tests PUBLIC LandscapeVerify)
122136

123137
add_executable(speed_expr
124138
experiment/cluster_speed_expr.cpp
125139
)
126-
add_dependencies(speed_expr DistribUpdateStreamingCC)
127-
target_link_libraries(speed_expr PUBLIC DistribUpdateStreamingCC)
140+
add_dependencies(speed_expr Landscape)
141+
target_link_libraries(speed_expr PUBLIC Landscape)
128142

129143
add_executable(k_speed_expr
130144
experiment/cluster_k_connect_expr.cpp
131145
)
132-
add_dependencies(k_speed_expr DistribUpdateStreamingCC)
133-
target_link_libraries(k_speed_expr PUBLIC DistribUpdateStreamingCC)
146+
add_dependencies(k_speed_expr Landscape)
147+
target_link_libraries(k_speed_expr PUBLIC Landscape)
134148

135149
add_executable(query_expr
136150
experiment/cluster_query_expr.cpp
137151
)
138-
add_dependencies(query_expr DistribUpdateStreamingCC)
139-
target_link_libraries(query_expr PUBLIC DistribUpdateStreamingCC)
152+
add_dependencies(query_expr Landscape)
153+
target_link_libraries(query_expr PUBLIC Landscape)
140154

141155
add_executable(correctness_expr
142156
experiment/cont_expr.cpp
143157
)
144-
add_dependencies(correctness_expr DistribUpdateStreamingVerifyCC)
145-
target_link_libraries(correctness_expr PUBLIC DistribUpdateStreamingVerifyCC)
158+
add_dependencies(correctness_expr LandscapeVerify)
159+
target_link_libraries(correctness_expr PUBLIC LandscapeVerify)
146160

147161
#####################################################################
148162
##################### Streaming utilities #####################

README.md

Lines changed: 32 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -1,124 +1,37 @@
11
# Landscape
22
Linear sketching for the connected components and k-edge connectivity problems. Landscape distributes the CPU intensive work of performing sketch updates to many worker nodes while keeping sketch data on the main node. The result is that we can process graph update streams at near sequential RAM bandwidth speeds.
33

4-
## Running experiments
5-
1. If the stream lives in a file, ensure that the file has been brought into the file cache before beginning the experiment. One way to do this is `cat 'stream_file' > /dev/null`
6-
2. You can monitor the status of the cluster by, in a seperate window, running the command `watch -n 1 cat cluster_status.txt`
7-
8-
## Cluster Provisioning
9-
### Ensure the master is able to read IPS
10-
There is an IAM Role that allows the EC2 instance to read IPS. This is used to automatically get the IPS.
11-
![image](https://user-images.githubusercontent.com/4708326/164508403-70fbb271-fa4c-4145-9093-ff86320e1bba.png)
12-
13-
### Tag the Master and Worker nodes
14-
15-
The script only reads properly tagged EC2 instances. The Master must be tagged 'ClusterNodeType:Master' to appear at the top of the host files. The Workers must be tagged 'ClusterNodeType:Worker'.
16-
17-
![image](https://user-images.githubusercontent.com/4708326/164511717-02f2feee-a9f8-4b04-a35e-fb53be5140ee.png)
18-
19-
## Main Node Installation and Setup
20-
21-
### 1. Install packages
4+
## Using Landscape
5+
Landscape is a C++ library built with CMake. You can easily use Landscape in your code through CMake with FetchContent or ExternalProject_Add.
6+
Requirements
7+
- OS: Unix (Not Mac): Tested on Amazon Linux and Ubuntu
8+
- cmake >= 3.16
9+
- openmpi 4.1.3
10+
- c++14
11+
12+
## Reproducing Our Experiments on EC2
13+
Landscape appears in [ALENEX'25](). You can reproduce our paper's experimental results by following these instructions. You will need access to an AWS account with roughly $60 in credits.
14+
15+
1. Create an AWS Secret Key. `IAM->Users->YourUsername->Security credentials`. Make note of the access key and secret key.
16+
2. Provision the Main Node on EC2. `EC2->Instances->Launch instances`
17+
- Select the Amazon Linux 2023 AMI. (That is, not Amazon Linux 2 AMI)
18+
- Choose `c5n.18xlarge` as the instance type.
19+
- Create a new key pair. Select `RSA` and call this key `Creation_Key`. (If you have already created this key pair then skip this step)
20+
- Select `Creation_Key` as the key pair.
21+
- Under Advanced details select create new placement group. Call the group `DistributedStreaming` and select `Cluster` as the placement strategy. (If you have already created this placement group then skip this step)
22+
- Select the `DistributedStreaming` placement group.
23+
4. Upload the ssh keypair to the main node. `rsync -ve "ssh -i </path/to/key>" </path/to/key> ec2-user@<public-dns-addr-of-main>:~/.ssh/id_rsa`
24+
- Find the public dns address `EC2->Instances->click instance->PublicIPv4 DNS`.
25+
5. Connect to the main node. `ssh -i <path/to/key> ec2-user@<public-dns-addr-of-main>`
26+
6. Install packages
2227
```
2328
sudo yum update -y
24-
sudo yum install -y tmux htop git gcc-c++ jq python3-pip
25-
pip install ansible
26-
```
27-
28-
### 2. Install cmake version 3.16+
29-
First Step:
30-
#### x86_64
31-
```
32-
wget https://github.com/Kitware/CMake/releases/download/v3.23.0-rc2/cmake-3.23.0-rc2-linux-x86_64.sh
33-
sudo mkdir /opt/cmake
34-
sudo sh cmake-3.23.0-rc2-linux-x86_64.sh --prefix=/opt/cmake
35-
```
36-
#### aarch64
37-
```
38-
wget https://github.com/Kitware/CMake/releases/download/v3.23.0-rc5/cmake-3.23.0-rc5-linux-aarch64.sh
39-
sudo mkdir /opt/cmake
40-
sudo sh cmake-3.23.0-rc5-linux-aarch64.sh --prefix=/opt/cmake
41-
```
42-
Second Step:
43-
```
44-
sudo ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
45-
```
46-
When running cmake .sh script enter y to license and n to install location.
47-
These commands install cmake version 3.23 but any version >= 3.16 will work.
48-
49-
### 4. Setup ssh keys
50-
* Copy EMR.pem to cluster `rsync -ve "ssh -i </path/to/EMR.pem>" </path/to/EMR.pem> <AWS-user>@<main_node_dns_addr>:.`
51-
* Ensure key being used is default rsa key for ssh `id_rsa` for example `cp EMR.pem ~/.ssh/id_rsa`
52-
53-
### 5. Clone DistributedStreamingCC Repo
54-
55-
## Cluster Setup
56-
57-
### Run `setup_tagged_workers.sh`
58-
This bash script will construct the ansible `inventory.ini` file; and the MPI `hostfile` and `rankfile`. The arguments to the script are the EC2 region where our machines are, number of physical CPUs on the main node, and number of physical CPUs on the worker nodes.
59-
60-
Example:
61-
```
62-
./setup_tagged_workers.sh us-west-2 36 8
63-
```
64-
The script will automatically set the known_hosts for all the machines in the cluster to whatever ssh-keyscan finds (this is a slight security issue if you don't trust the cluster but should be fine as we aren't transmitting sensative data). It will additionally confirm with you that the `inventory.ini` and `hostfile` it creates look reasonable.
65-
66-
### Run unit tests
67-
After running the setup script you should be able to run the unit tests from the build directory.
68-
```
69-
mpirun -np 22 -hostfile hostfile -rf rankfile ./distrib_tests
70-
```
71-
-np denotes the number of processes to run. Should be number of worker nodes +21.
72-
73-
## Cluster Setup (Manual)
74-
Ansible files for setting up the cluster are found under `tools/ansible`.
75-
Ansible commands are run with `ansible-playbook -i /path/to/inventory.ini /path/to/<script>.yaml`.
76-
77-
### 1. Distribute ssh keys to cluster
78-
* Run ansible file `ssh.yaml` with `ansible-playbook -i inventory.ini DistributedStreamingCC/tools/ansible/ssh.yaml`
79-
80-
### 2. Install MPI on nodes in cluster
81-
* Run ansible file `mpi.yaml` with `ansible-playbook -i inventory.ini DistributedStreamingCC/tools/ansible/mpi.yaml`
82-
* Run `source ~/.bashrc` in open terminal on main node
83-
84-
### 3. Build Distributed Streaming Repo
85-
* make `build` directory in project repo
86-
* run `cmake .. ; make -j` in build directory
87-
88-
### 4. Distribute executables and hostfile to worker nodes
89-
* Run ansible file `files.yaml` with `ansible-playbook -i inventory.ini DistributedStreamingCC/tools/ansible/files.yaml`
90-
91-
### EFA Installation instructions
92-
* Follow the instructions at https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security
93-
94-
## Amazon Storage
95-
### EBS Storage
96-
EBS disks are generally found installed at `/mnt/nvmeXnX` where X is the disk number. In order to use them, the disk must be formatted and then mounted.
97-
* `sudo lsblk -f` to list all devices
98-
* (Optional) If no filesystem exists on the device than run `sudo mkfs -t xfs /dev/<device>` to format the drive. This will overwrite the content on the device so DO NOT do this if a filesystem already exists.
99-
* Create the mount point directory `sudo mkdir /mnt/<mnt_point>`
100-
* Mount the device `sudo mount /dev/<device> /mnt/<mnt_point>`
101-
* Adjust owner and permissions of mount point `sudo chown -R <user> /mnt/<mnt_point>` and `chmod a+rw /mnt/<mnt_point>`
102-
103-
## Single Machine Setup
104-
105-
### 1. Install OpenMPI
106-
For Ubuntu the following will install openmpi
107-
```
108-
sudo apt update
109-
sudo apt install libopenmpi-dev
110-
```
111-
Google is your friend for other operating systems :)
112-
113-
### 2. Run executables
114-
Use the `mpirun` command to run mpi programs. For example, to run the unit tests with 4 processes, the following command is used.
115-
```
116-
mpirun -np 4 ./distrib_tests
117-
```
118-
119-
## Tips for Debugging with MPI
120-
If you want to run the code using a debugging tool like gdb you can perform the following steps.
121-
1. Compile with debugging flags `cmake -DCMAKE_BUILD_TYPE=Debug .. ; make`
122-
2. Launch the mpi task with each process in its own window using xterm `mpirun -np <num_proc> term -hold -e gdb <executable>`
123-
124-
Print statement debugging can also be helpful, as even when running in a cluster across many machines, all the output to console across the workers is printed out by the main process.
29+
sudo yum install -y tmux git
30+
```
31+
7. Clone this repository. IMPORTANT: Ensure the repository is cloned to the ec2-user home directory and that the name is unchanged. `~/Landscape`
32+
8. From `~/Landscape` run `bash runme.sh`. This script will prompt you for the following:
33+
- Agree to the use of sudo commands
34+
- Choose whether to run the `full` experiments (all datapoints) or `limited` experiments (fewer datapoints per experiment)
35+
- Enter your AWS secret key and default EC2 region (this should be the region in which the main node was created)
36+
9. After the experiments conclude, copy `figures.pdf` from `~/Landscape` to your personal computer. You can accomplish this by running: `rsync -ve "ssh -i ~/.ssh/Creation_Key.pem" ec2-user@<public-dns-addr-of-main>:~/Landscape/figures.pdf .` on your personal computer.
37+
10. Terminate the main node in EC2

experiment/cluster_k_connect_expr.cpp

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -105,21 +105,23 @@ int main(int argc, char** argv) {
105105
for (node_id_t src = 0; src < num_nodes; src++) {
106106
edges += sf_adj[src].size();
107107
}
108-
109-
std::ofstream out{output, std::ofstream::out | std::ofstream::app}; // open the outfile
110108
std::cout << "number of spanning forest edges: " << edges << std::endl;
111-
std::cout << "Writing runtime stats to " << output << std::endl;
112109

113-
// calculate the insertion rate and write to file
110+
// calculate the insertion rate and print
114111
// insertion rate measured in stream updates
115112
// (not in the two sketch updates we process per stream update)
116113
float ins_per_sec = (((float)(total * repeats)) / runtime.count());
117-
out << "Procesing " << total * repeats << " updates took " << runtime.count() << " seconds, "
118-
<< ins_per_sec << " per second\n";
119114

120-
out << "Finding " << num_forests << " Spanning Forests took " << CC_time.count()
121-
<< " and found " << edges << " edges\n";
122-
out << "Total Memory used (MiB): " << get_max_mem_used() << std::endl;
115+
std::cout << "Processing " << total * repeats << " updates took " << runtime.count()
116+
<< " seconds, " << ins_per_sec << " per second\n";
117+
118+
std::cout << "Finding " << num_forests << " Spanning Forests took " << CC_time.count()
119+
<< " and found " << edges << " edges\n";
120+
121+
std::ofstream out{output, std::ofstream::out | std::ofstream::app}; // open the outfile
122+
std::cout << "Writing runtime stats to " << output << std::endl;
123+
124+
out << std::fixed << ins_per_sec / 1e6 << ", " << CC_time.count() << ", " << get_max_mem_used() << std::endl;
123125
out.close();
124126
} else {
125127
node_id_t num_nodes = std::stoull(argv[4]);
@@ -180,21 +182,19 @@ int main(int argc, char** argv) {
180182
for (node_id_t src = 0; src < num_nodes; src++) {
181183
edges += sf_adj[src].size();
182184
}
185+
std::cout << "number of spanning forest edges: " << edges << std::endl;
186+
float ins_per_sec = (((float)(num_edges)) / runtime.count());
187+
188+
std::cout << "Processing " << num_edges << " updates took " << runtime.count()
189+
<< " seconds, " << ins_per_sec << " per second\n";
190+
191+
std::cout << "Finding " << num_forests << " Spanning Forests took " << CC_time.count()
192+
<< " and found " << edges << " edges\n";
183193

184194
std::ofstream out{output, std::ofstream::out | std::ofstream::app}; // open the outfile
185-
std::cout << "number of spanning forest edges: " << edges << std::endl;
186195
std::cout << "Writing runtime stats to " << output << std::endl;
187196

188-
// calculate the insertion rate and write to file
189-
// insertion rate measured in stream updates
190-
// (not in the two sketch updates we process per stream update)
191-
float ins_per_sec = (float(num_edges) / runtime.count());
192-
out << "Procesing " << edges << " updates took " << runtime.count() << " seconds, "
193-
<< ins_per_sec << " per second\n";
194-
195-
out << "Finding " << num_forests << " Spanning Forests took " << CC_time.count()
196-
<< " and found " << edges << " edges\n";
197-
out << "Total Memory used (MiB): " << get_max_mem_used() << std::endl;
197+
out << std::fixed << ins_per_sec / 1e6 << ", " << CC_time.count() << ", " << get_max_mem_used() << std::endl;
198198
out.close();
199199
}
200200

0 commit comments

Comments
 (0)