diff --git a/Project.toml b/Project.toml index 86ea0a5..2e084bf 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,17 @@ name = "RidgeRegression" uuid = "739161c8-60e1-4c49-8f89-ff30998444b1" -authors = ["Vivak Patel "] version = "0.1.0" +authors = ["Eton Tackett ", "Vivak Patel "] + +[deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [compat] +CSV = "0.10.15" +DataFrames = "1.8.1" +Downloads = "1.7.0" +LinearAlgebra = "1.12.0" julia = "1.12.4" diff --git a/docs/make.jl b/docs/make.jl index a1097bb..d42cfbe 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,6 +14,7 @@ makedocs(; ), pages=[ "Home" => "index.md", + "Design" => "design.md", ], ) diff --git a/docs/src/design.md b/docs/src/design.md new file mode 100644 index 0000000..794fece --- /dev/null +++ b/docs/src/design.md @@ -0,0 +1,93 @@ +# Motivation and Background +Many modern science problems involve regression problems with extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator. + +Mathematically, ridge regression estimates the regression coefficients by solving the penalized least squares problem +${ +\hat{\boldsymbol{\beta}} = +\arg\min_{\boldsymbol{\beta}} +\left( +\| \mathbf{y} - X\boldsymbol{\beta} \|^2 ++ +\lambda \| \boldsymbol{\beta} \|^2 +\right)} +$ +where $\lambda > 0$ is a regularization parameter that controls the strength of the penalty. + +The purpose of ridge regression is to stabilize regression estimates where the predictors are highly correlated or the design matrix $X$ is almost singular. Ridge regression shrinks the estimated coefficient vector in a way such that the coefficient estimates minimize the sum of squared residuals subject to a constraint on the $\ell_2$ norm of the coefficient vector, $\|\boldsymbol{\beta}\|^2 \leq t$, which shrinks the least squares estimates toward the origin. This reduces the variance of the coefficient estimates and mitigates the effects of multicollinearity. + +There are many numerical algorithms available to compute ridge regression estimates including direct methods, Krylov subspace methods, gradient-based optimization, coordinate descent, and stochastic gradient descent. These algorithms differ in their computational costs and numerical stability. + +The goal of this experiment is to investigate the performance of these algorithms when we vary the structure and scale of the regression problem. To do this, we consider the linear model $\mathbf{y} = X\boldsymbol{\beta} + \boldsymbol{\varepsilon}$ where the matrix ${X}$ may be constructed with varying dimensions, sparsity patterns, and conditioning properties. +# Questions +The primary goal of this experiment is to compare numerical algorithms for computing ridge regression estimates under various conditions. In particular, we aim to address the following questions: + +1. How does the performance of ridge regression algorithms change as the structural and numerical properties of the regression problem vary? + +2. Which ridge regression algorithm provides the best balance between numerical stability and computational cost across these problem regimes? + +# Experimental Units +The experimental units are the datasets under fixed penalty weights. For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$ for some specific ${\lambda}$. + +Blocks are defined by combinations of the experimental blocking factors, including dimensional regime, matrix sparsity, and ridge penalty magnitude. Each block represents datasets with similar structural properties. Within each block, multiple datasets will be generated, and each dataset forms an experimental unit. For every experimental unit all treatments are applied. + +Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. + +In addition to dimensional block, the strength of the ridge penalty will be incorporated as a secondary blocking factor. The ridge estimator is $\hat{\beta_R} = (X^\top X + \lambda I)^{-1}X^\top y$. The matrix conditioning number is defined as $\kappa(A) = \frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$. In the context of ridge regression, the regularization parameter ${\lambda}$, can impact the conditioning number. Let $X = U\Sigma V^\top$ be the SVD of $X$, with singular values $\sigma_1,\dots,\sigma_p$. + +Then +```math +X^\top X = V \Sigma^\top \Sigma V^\top += V \,\mathrm{diag}(\sigma_1^2,\dots,\sigma_p^2)\, V^\top . +``` + +Adding the ridge term gives + +```math +X^\top X + \lambda I += +V \,\mathrm{diag}(\sigma_1^2+\lambda,\dots,\sigma_p^2+\lambda)\, V^\top . +``` + +```math +\kappa_2(X^\top X+\lambda I) += +\frac{\sigma_{\max}^2+\lambda}{\sigma_{\min}^2+\lambda}. +``` + +Because the performance of numerical algorithms is strongly influenced by the conditioning of the system they solve, the ridge penalty effectively creates regression problems with different numerical difficulty. This provides a way to assess how algorithm performance, convergence behavior, and computational cost depend on the numerical stability of the problem. In this experiment, the magnitude of $\lambda$ is selected relative to the smallest and largest singular values of $X$. A weak regularization regime corresponds to $\lambda \approx \sigma_{\min}^2$, where the ridge penalty begins to influence the smallest singular directions but the system remains moderately ill-conditioned. A moderate regularization regime corresponds to $\lambda \approx \sigma_{\min}\sigma_{\max}$, which substantially improves the conditioning of the problem by increasing the smallest eigenvalues of $X^\top X + \lambda I$. Finally, a strong regularization regime corresponds to $\lambda \approx \sigma_{\max}^2$, where the ridge penalty dominates the spectral scale of the problem and produces a well-conditioned system. + +Another blocking factor that will be considered is how sparse or dense the matrix $X$ is. Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving $X$ including matrix-matrix products and matrix-vector products. A dense matrix leads to high computational cost whereas a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of X, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency. + +The total number of block combinations is determined by the product of the number of levels in each blocking factor, denoted b. For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. We will also denote r to be the number of replicated datasets in each block. Here, we mean the number datasets within a block. The total number of experimental units is then ${b * r}$. + +| Blocking System | Factor | Blocks | +|:----------------|:-------|:-------| +| Dataset | Dimensional regime | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| +| Ridge Penalty | Magnitude of ${\lambda}$ relative to the spectral scale of $X^\top X$ | Weak ($\lambda \approx \sigma_{\min}^2$), Moderate ($\lambda \approx \sigma_{\min}\sigma_{\max}$), Strong ($\lambda \approx \sigma_{\max}^2$), where $\sigma_{\min}$ and $\sigma_{\max}$ denote the smallest and largest singular values of $X$. | +| Matrix Sparsity| Density of non-zero values in $X$ | Sparse (< 10% non-zero), Moderate (10%-50% non-zero), Dense (> 50% non-zero)| +# Treatments + +The treatments are the ridge regression solution methods: + +- Gradient-based optimization +- Stochastic gradient descent +- Direct Methods + - Golub Kahan Bidiagonalization + + Since each experimental unit will recieves all t treatments, the total number of algorithm runs in the experiment is ${t * b * r}$. For this experiment, ${t=3}$. To ensure fair comparison between algorithms, each treatment will be applied under a fixed time constraint. Each algorithm will be run for a maximum of two hours per experimental unit. +# Observational Units and Measurements + +The observational units are each algorithm-dataset pair. For each combination we will observe the following + +| Column Name | Data Type | Description | +|:---|:---|:---| +| `dataset_id` | Positive Integer | Identifier for the generated dataset (experimental unit). | +| `dimensional_regime` | String | Relationship between predictors and observations: `p << n`, `p ≈ n`, or `p >> n`. | +| `sparsity_level` | String | Density of the matrix `X`: `Sparse`, `Moderate`, or `Dense`. | +| `lambda_level` | String | Relative magnitude of the ridge penalty parameter `λ`: `Weak`, `Moderate`, or `Strong`. | +| `algorithm` | String | Ridge regression solution method used: `GradientDescent`, `SGD`, or `DirectMethod`. | +| `runtime_seconds` | Positive Floating-point | Time required for the algorithm to compute a solution. | +| `iterations` | Positive Integer | Number of iterations performed by the algorithm (`NA` for direct methods). | + + +The collected measurements will be written to a CSV file. Each row in the file corresponds to a single algorithm–dataset pair, which forms the observational unit of the experiment. The columns represent the recorded measurements. After the experiment, the resulting CSV file should contain ${Algorithms∗Datasets}$ number of rows and each row will contain exactly seven columns. \ No newline at end of file diff --git a/src/RidgeRegression.jl b/src/RidgeRegression.jl index c32de91..79d85b2 100644 --- a/src/RidgeRegression.jl +++ b/src/RidgeRegression.jl @@ -1,5 +1,10 @@ module RidgeRegression +using CSV +using DataFrames +using LinearAlgebra -# Write your package code here. +include("bidiagonalization.jl") + +export compute_givens, rotate_rows!, rotate_cols!, bidiagonalize_with_H, apply_Ht_to_b end diff --git a/src/bidiagonalization.jl b/src/bidiagonalization.jl new file mode 100644 index 0000000..eb2bdc6 --- /dev/null +++ b/src/bidiagonalization.jl @@ -0,0 +1,165 @@ +""" + compute_givens(a, b) + +Compute Givens rotation coefficients for scalars `a` and `b`. + +# Arguments +- `a::Number`: First scalar +- `b::Number`: Second scalar + +# Returns +- `c::Number`: Cosine coefficient of the Givens rotation +- `s::Number`: Sine coefficient of the Givens rotation + +""" +function compute_givens(a::Number, b::Number) # Compute Givens rotation coefficients for scalars a and b + if b == 0 + return one(a), zero(a) + elseif a == 0 + throw(ArgumentError("a is zero, cannot compute Givens rotation")) + else + r = hypot(a, b) + c = a / r + s = b / r + return c, s + end +end + +""" + rotate_rows!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + +Apply a Givens rotation to rows `i` and `j` of matrix `M`. + +# Arguments +- `M::AbstractMatrix`: The matrix to be rotated +- `i::Int`: First row index +- `j::Int`: Second row index +- `c::Number`: Cosine coefficient of the Givens rotation +- `s::Number`: Sine coefficient of the Givens rotation + +# Returns +- `M::AbstractMatrix`: The rotated matrix + +""" +function rotate_rows!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + for k in 1:size(M,2) # Loop over columns + temp = M[i,k] # Store the original value of M[i,k] before modification + M[i,k] = c*temp + s*M[j,k] + M[j,k] = -conj(s)*temp + c*M[j,k] #Apply the Givens rotation to the elements in rows i and j + end + return M +end + + +""" + rotate_cols!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + +Apply a Givens rotation to columns `i` and `j` of matrix `M`. + +# Arguments +- `M::AbstractMatrix`: The matrix to be rotated +- `i::Int`: First column index +- `j::Int`: Second column index +- `c::Number`: Cosine coefficient of the Givens rotation +- `s::Number`: Sine coefficient of the Givens rotation + +# Returns +- `M::AbstractMatrix`: The rotated matrix + +""" +function rotate_cols!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + for k in 1:size(M,1) # Loop over rows + temp = M[k,i] # Store the original value of M[k,i] before modification + M[k,i] = c*temp + s*M[k,j] + M[k,j] = -conj(s)*temp + c*M[k,j] # Apply the Givens rotation to the elements in columns i and j + end + return M +end +""" + bidiagonalize_with_H(A::AbstractMatrix, L::AbstractMatrix, b::AbstractVector) + +Performs bidiagonalization of a matrix A using a sequence of Givens transformations while explicitly accumulating +the orthogonal left factor `H` and right factor `K` such that + + H' * A * K = B. + +# Arguments +- `A::AbstractMatrix`: The matrix to be bidiagonalized +- `L::AbstractMatrix`: The banded matrix to be updated in-place +- `b::AbstractVector`: The vector to be transformed + +# Returns +- `B::AbstractMatrix`: The bidiagonalized form of the input matrix A with dimension (n,n) +- `C::AbstractMatrix`: The matrix resulting from the sequence of Givens transformations +- `H::AbstractMatrix`: The orthogonal left factor +- `K::AbstractMatrix`: The orthogonal right factor +- `Ht::AbstractMatrix`: The transpose of the orthogonal left factor +- `b::AbstractVector`: The vector resulting from applying the Givens transformations to the input vector b + +""" + +function bidiagonalize_with_H(A::AbstractMatrix, L::AbstractMatrix, b::AbstractVector) + m, n = size(A) + + B = copy(A) #Will be transformed into bidiagonal form + C = copy(L) + bhat = copy(b) #Will recieve same left transformations applied to A + + Ht = Matrix{eltype(A)}(I, m, m) #Ht will accumulate the left transformations, initialized as identity + K = Matrix{eltype(A)}(I, n, n) #K will accumulate the right transformations, initialized as identity + + imax = min(m, n) + + for i in 1:imax + # Left Givens rotations: zero below diagonal in column i + for j in m:-1:(i + 1) + if B[j, i] != 0 + c, s = compute_givens(B[i, i], B[j, i]) #Build Givens rotation to zero B[j, i] + rotate_rows!(B, i, j, c, s) #Apply the Givens rotation to rows i and j of B + rotate_rows!(Ht, i, j, c, s) #Accumulate the left transformations in Ht + B[j, i] = zero(eltype(B)) + end + end + + # Right Givens rotations: zero entries right of superdiagonal + if i <= n - 2 + for k in n:-1:(i + 2) + if B[i, k] != 0 + c, s = compute_givens(B[i, i + 1], B[i, k]) #Build Givens rotation to zero B[i, k] + #s = -s + rotate_cols!(B, i + 1, k, c, s) #Apply the Givens rotation to columns i+1 and k of B + rotate_cols!(C, i + 1, k, c, s) #Apply the same right rotation to C, since C is updated by the right transformations + rotate_cols!(K, i + 1, k, c, s) #Accumulate the right transformations in K + B[i, k] = zero(eltype(B)) + end + end + end + end + + H = transpose(Ht) + bhat = apply_Ht_to_b(Ht, b) + return B, C, H, K, Ht, bhat +end + +""" + apply_Ht_to_b(Ht::AbstractMatrix, b::AbstractVector) + +Apply the accumulated left orthogonal transformation `H'` (stored as `Ht`) +to the constant vector `b`. + +# Arguments +- `Ht::AbstractMatrix`: The transpose of the orthogonal left factor `H`. +- `b::AbstractVector`: The vector to be transformed. + +# Returns +- `bhat::AbstractVector`: The transformed vector satisfying `bhat = Ht * b`. + +# Throws +- `DimensionMismatch`: If the number of columns of `Ht` does not match the length of `b`. +""" +function apply_Ht_to_b(Ht::AbstractMatrix, b::AbstractVector) + size(Ht, 2) == length(b) || throw(DimensionMismatch( + "Ht has $(size(Ht, 2)) columns but b has length $(length(b))" + )) + return Ht * b +end diff --git a/test/Project.toml b/test/Project.toml index 0c36332..73141b0 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,2 +1,5 @@ [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" \ No newline at end of file diff --git a/test/apply_Ht_to_b_test.jl b/test/apply_Ht_to_b_test.jl new file mode 100644 index 0000000..9e11e83 --- /dev/null +++ b/test/apply_Ht_to_b_test.jl @@ -0,0 +1,34 @@ +@testset "apply_Ht_to_b returns original vector when Ht is identity" begin + Ht = Matrix{Float64}(I, 3, 3) + b = [1.0, 2.0, 3.0] + + @test apply_Ht_to_b(Ht, b) == b +end + +@testset "apply_Ht_to_b correctly applies H^T to vector" begin + Ht =[1.0 0.0 0.0; + 0.0 0.0 1.0; + 0.0 1.0 0.0] + + b = [4.0, 5.0, 6.0] + + @test apply_Ht_to_b(Ht, b) == [4.0, 6.0, 5.0] +end + +@testset "apply_Ht_to_b correctly applies Givens rotation to vector" begin + c, s = 3/5, 4/5 + + Ht =[c s; + -s c] + + b = [5.0, 0.0] + + @test apply_Ht_to_b(Ht, b) ≈ [3.0, -4.0] +end + +@testset "pply_Ht_to_b throws DimensionMismatch" begin + Ht = Matrix{Float64}(I, 3, 3) + b = [1.0, 2.0] + + @test_throws DimensionMismatch apply_Ht_to_b(Ht, b) +end diff --git a/test/bidiagonalize_with_H_tests.jl b/test/bidiagonalize_with_H_tests.jl new file mode 100644 index 0000000..0865b9d --- /dev/null +++ b/test/bidiagonalize_with_H_tests.jl @@ -0,0 +1,101 @@ +@testset "bidiagonalize_with_H preserves orthogonality and factorization identities" begin + A = [1.0 2.0 3.0; + 4.0 5.0 6.0; + 7.0 8.0 10.0] + + L = Matrix{Float64}(I, 3, 3) + b = [1.0, 2.0, 3.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + # Ht should be the transpose of H + @test Ht ≈ H' + + # Orthogonality of H and K + @test H' * H ≈ Matrix{Float64}(I, 3, 3) + @test H * H' ≈ Matrix{Float64}(I, 3, 3) + @test K' * K ≈ Matrix{Float64}(I, 3, 3) + @test K * K' ≈ Matrix{Float64}(I, 3, 3) + + # Main factorization identity + @test H' * A * K ≈ B + + @test H' * b ≈ bhat + + # Applies right transforms to L, implicitly assuming J = I + @test L * K ≈ C +end + +@testset "bidiagonalize_with_H produces upper bidiagonal matrix" begin + A = [2.0 1.0 3.0; + 4.0 5.0 6.0; + 7.0 8.0 9.0] + + L = Matrix{Float64}(I, 3, 3) + b = [1.0, -1.0, 2.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + m, n = size(B) + + for i in 1:m + for j in 1:n + # In an upper bidiagonal matrix, only diagonal and superdiagonal may be nonzero + if !(j == i || j == i + 1) + @test isapprox(B[i, j], 0.0; atol=1e-10, rtol=0) + end + end + end +end + +@testset "bidiagonalize_with_H handles rectangular matrices" begin + A = [1.0 2.0 3.0; + 4.0 5.0 6.0; + 7.0 8.0 9.0; + 1.0 0.0 1.0] + + L = Matrix{Float64}(I, 3, 3) + b = [1.0, 2.0, 3.0, 4.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + m, n = size(A) + + @test size(B) == (m, n) + @test size(C) == size(L) + @test size(H) == (m, m) + @test size(K) == (n, n) + @test size(Ht) == (m, m) + @test length(bhat) == length(b) + + @test H' * H ≈ Matrix{Float64}(I, m, m) + @test K' * K ≈ Matrix{Float64}(I, n, n) + + @test H' * A * K ≈ B + @test H' * b ≈ bhat + @test L * K ≈ C + + for i in 1:m + for j in 1:n + if !(j == i || j == i + 1) + @test isapprox(B[i, j], 0.0; atol=1e-10, rtol=0) + end + end + end +end + +@testset "bidiagonalize_with_H correctly zeroes subdiagonal entry in small matrix" begin + A = [3.0 0.0; + 4.0 5.0] + + L = Matrix{Float64}(I, 2, 2) + b = [1.0, 2.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + @test H' * A * K ≈ B + @test H' * b ≈ bhat + @test L * K ≈ C + + @test isapprox(B[2,1], 0.0; atol=1e-12, rtol=0) +end diff --git a/test/compute_givens_test.jl b/test/compute_givens_test.jl new file mode 100644 index 0000000..c3d503f --- /dev/null +++ b/test/compute_givens_test.jl @@ -0,0 +1,13 @@ +@testset "Testset 1" begin + c, s = compute_givens(3.0, 0.0) + @test c == 1.0 + @test s == 0.0 + a = 3.0 + b = 4.0 + c, s = compute_givens(a, b) + v1 = c*a + s*b + v2 = -s*a + c*b + @test isapprox(v2, 0.0; atol=1e-12, rtol=0) + @test isapprox(abs(v1), hypot(a, b); atol=1e-12, rtol=0) + @test_throws ArgumentError compute_givens(0.0, 2.0) +end diff --git a/test/rotate_cols_test.jl b/test/rotate_cols_test.jl new file mode 100644 index 0000000..cf8eb7b --- /dev/null +++ b/test/rotate_cols_test.jl @@ -0,0 +1,9 @@ +@testset "rotate_cols! zeros second column entry using Givens rotation" begin + M = [3.0 4.0; + 1.0 2.0] + + c, s = compute_givens(3.0, 4.0) + rotate_cols!(M, 1, 2, c, s) + + @test isapprox(M[1, 2], 0.0; atol=1e-12, rtol=0) +end diff --git a/test/rotate_rows_test.jl b/test/rotate_rows_test.jl new file mode 100644 index 0000000..9e41fb7 --- /dev/null +++ b/test/rotate_rows_test.jl @@ -0,0 +1,9 @@ +@testset "rotate_rows! correctly applies Givens rotation to rows" begin + M = [1.0 2.0; + 3.0 4.0] + + c, s = compute_givens(1.0, 3.0) + rotate_rows!(M, 1, 2, c, s) + + @test isapprox(M[2, 1], 0.0; atol=1e-12, rtol=0) +end diff --git a/test/runtests.jl b/test/runtests.jl index dbbe06f..9362c6f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,25 @@ using RidgeRegression using Test +using LinearAlgebra @testset "RidgeRegression.jl" begin - # Write your tests here. + @testset "Compute Givens Rotations Tests" begin + include("compute_givens_test.jl") + end + + @testset "Rotate Columns Tests" begin + include("rotate_cols_test.jl") + end + + @testset "Rotate Rows Tests" begin + include("rotate_rows_test.jl") + end + + @testset "Bidiagonalization Tests" begin + include("bidiagonalize_with_H_tests.jl") + end + + @testset "Applying Ht to b Tests" begin + include("apply_Ht_to_b_test.jl") + end end