diff --git a/perf/neural.jl b/perf/neural.jl index a9c9f28..06a4128 100644 --- a/perf/neural.jl +++ b/perf/neural.jl @@ -1,13 +1,40 @@ -# Needs https://github.com/jump-dev/JuMP.jl/pull/3451 +# Neural network optimization using ArrayDiff + NLopt +# +# This demonstrates end-to-end optimization of a simple two-layer neural +# network with array-valued decision variables, array-aware AD, and a +# first-order NLP solver. + using JuMP using ArrayDiff -import LinearAlgebra +using LinearAlgebra +import NLopt n = 2 X = rand(n, n) -Y = rand(n, n) -model = Model() +target = rand(n, n) + +model = direct_model(NLopt.Optimizer()) +set_attribute(model, "algorithm", :LD_LBFGS) + @variable(model, W1[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) @variable(model, W2[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) -Y_hat = W2 * tanh.(W1 * X) -loss = LinearAlgebra.norm(Y_hat .- Y) + +# Set non-zero starting values to avoid saddle point at zero +for i in 1:n, j in 1:n + set_start_value(W1[i, j], 0.1 * randn()) + set_start_value(W2[i, j], 0.1 * randn()) +end + +# Forward pass: Y = W2 * tanh.(W1 * X) +Y = W2 * tanh.(W1 * X) + +# Loss: ||Y - target|| (norm returns a scalar-shaped GenericArrayExpr) +loss = norm(Y .- target) +@objective(model, Min, loss) + +optimize!(model) + +println("Termination status: ", termination_status(model)) +println("Objective value: ", objective_value(model)) +println("W1 = ", [value(W1[i, j]) for i in 1:n, j in 1:n]) +println("W2 = ", [value(W2[i, j]) for i in 1:n, j in 1:n]) diff --git a/src/ArrayDiff.jl b/src/ArrayDiff.jl index 041c2c9..c38e721 100644 --- a/src/ArrayDiff.jl +++ b/src/ArrayDiff.jl @@ -48,11 +48,8 @@ include("model.jl") include("parse.jl") include("evaluator.jl") -""" - Mode() <: AbstractAutomaticDifferentiation - -Fork of `MOI.Nonlinear.SparseReverseMode` to add array support. 
-""" +include("array_nonlinear_function.jl") +include("parse_moi.jl") function Evaluator( model::ArrayDiff.Model, @@ -62,6 +59,20 @@ function Evaluator( return Evaluator(model, NLPEvaluator(model, ordered_variables)) end +# Called by solvers (e.g., NLopt) via: +# MOI.Nonlinear.Evaluator(nlp_model, ad_backend, vars) +# When nlp_model is an ArrayNonlinearFunction and ad_backend is Mode(), +# we build an ArrayDiff.Model and return our Evaluator. +function Nonlinear.Evaluator( + func::ArrayNonlinearFunction, + ::Mode, + ordered_variables::Vector{MOI.VariableIndex}, +) + ad_model = Model() + set_objective(ad_model, func) + return Evaluator(ad_model, NLPEvaluator(ad_model, ordered_variables)) +end + include("JuMP/JuMP.jl") end # module diff --git a/src/JuMP/JuMP.jl b/src/JuMP/JuMP.jl index c75a800..9ed23d4 100644 --- a/src/JuMP/JuMP.jl +++ b/src/JuMP/JuMP.jl @@ -10,3 +10,4 @@ include("variables.jl") include("nlp_expr.jl") include("operators.jl") include("print.jl") +include("moi_bridge.jl") diff --git a/src/JuMP/moi_bridge.jl b/src/JuMP/moi_bridge.jl new file mode 100644 index 0000000..498a282 --- /dev/null +++ b/src/JuMP/moi_bridge.jl @@ -0,0 +1,44 @@ +# Conversion from JuMP array types to MOI ArrayNonlinearFunction +# and set_objective_function for scalar-shaped (0-dim) array expressions. 
+ +# ── moi_function: JuMP → MOI ───────────────────────────────────────────────── + +function _to_moi_arg(x::ArrayOfVariables{T,N}) where {T,N} + return ArrayOfVariableIndices{N}(x.offset, x.size) +end + +function _to_moi_arg(x::GenericArrayExpr{V,N}) where {V,N} + args = Any[_to_moi_arg(a) for a in x.args] + return ArrayNonlinearFunction{N}(x.head, args, x.size, x.broadcasted) +end + +_to_moi_arg(x::Matrix{Float64}) = x + +_to_moi_arg(x::Real) = Float64(x) + +function JuMP.moi_function(x::GenericArrayExpr{V,N}) where {V,N} + return _to_moi_arg(x) +end + +# ── set_objective_function for scalar-shaped array expressions ─────────────── +# GenericArrayExpr{V,0} (size=()) is scalar-valued but contains array +# subexpressions. JuMP's default set_objective_function only handles +# AbstractJuMPScalar, so we add a method here. We also set the +# AutomaticDifferentiationBackend to ArrayDiff.Mode() so that the solver +# uses ArrayDiff's evaluator. + +function JuMP.set_objective_function( + model::JuMP.GenericModel{T}, + func::GenericArrayExpr{JuMP.GenericVariableRef{T},0}, +) where {T<:Real} + f = JuMP.moi_function(func) + MOI.set( + JuMP.backend(model), + MOI.AutomaticDifferentiationBackend(), + Mode(), + ) + attr = MOI.ObjectiveFunction{typeof(f)}() + MOI.set(JuMP.backend(model), attr, f) + model.is_model_dirty = true + return +end diff --git a/src/JuMP/operators.jl b/src/JuMP/operators.jl index 47b5cb3..7796be2 100644 --- a/src/JuMP/operators.jl +++ b/src/JuMP/operators.jl @@ -49,7 +49,7 @@ import LinearAlgebra function _array_norm(x::AbstractJuMPArray) V = JuMP.variable_ref_type(x) - return JuMP.GenericNonlinearExpr{V}(:norm, Any[x]) + return GenericArrayExpr{V,0}(:norm, Any[x], (), false) end # Define norm for each concrete AbstractJuMPArray subtype to avoid @@ -62,3 +62,49 @@ end function LinearAlgebra.norm(x::ArrayOfVariables) return _array_norm(x) end + +# Subtraction between array expressions and constant arrays +function Base.:(-)(x::AbstractJuMPArray{T,N}, 
y::AbstractArray{S,N}) where {S,T,N} + V = JuMP.variable_ref_type(x) + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:-, Any[x, y], size(x), false) +end + +function Base.:(-)(x::AbstractArray{S,N}, y::AbstractJuMPArray{T,N}) where {S,T,N} + V = JuMP.variable_ref_type(y) + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:-, Any[x, y], size(y), false) +end + +function Base.:(-)( + x::AbstractJuMPArray{T,N}, + y::AbstractJuMPArray{S,N}, +) where {T,S,N} + V = JuMP.variable_ref_type(x) + @assert JuMP.variable_ref_type(y) == V + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:-, Any[x, y], size(x), false) +end + +# Addition between array expressions and constant arrays +function Base.:(+)(x::AbstractJuMPArray{T,N}, y::AbstractArray{S,N}) where {S,T,N} + V = JuMP.variable_ref_type(x) + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:+, Any[x, y], size(x), false) +end + +function Base.:(+)(x::AbstractArray{S,N}, y::AbstractJuMPArray{T,N}) where {S,T,N} + V = JuMP.variable_ref_type(y) + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:+, Any[x, y], size(y), false) +end + +function Base.:(+)( + x::AbstractJuMPArray{T,N}, + y::AbstractJuMPArray{S,N}, +) where {T,S,N} + V = JuMP.variable_ref_type(x) + @assert JuMP.variable_ref_type(y) == V + @assert size(x) == size(y) + return GenericArrayExpr{V,N}(:+, Any[x, y], size(x), false) +end diff --git a/src/array_nonlinear_function.jl b/src/array_nonlinear_function.jl new file mode 100644 index 0000000..1224d49 --- /dev/null +++ b/src/array_nonlinear_function.jl @@ -0,0 +1,94 @@ +""" + ArrayNonlinearFunction{N} <: MOI.AbstractVectorFunction + +Represents an N-dimensional array-valued nonlinear function for MOI. + +The `output_dimension` is `prod(size)` — the vectorization of the array — since +`MOI.AbstractVectorFunction` cannot represent multidimensional arrays. No actual +vectorization is performed; this is only for passing through MOI layers. 
+ +## Fields + + - `head::Symbol`: the operator (e.g., `:*`, `:tanh`) + - `args::Vector{Any}`: arguments, which may be `ArrayNonlinearFunction`, + `MOI.ScalarNonlinearFunction`, `MOI.VariableIndex`, `Float64`, + `Vector{Float64}`, `Matrix{Float64}`, or `ArrayOfVariableIndices` + - `size::NTuple{N,Int}`: the dimensions of the output array + - `broadcasted::Bool`: whether this is a broadcasted operation +""" +struct ArrayNonlinearFunction{N} <: MOI.AbstractVectorFunction + head::Symbol + args::Vector{Any} + size::NTuple{N,Int} + broadcasted::Bool +end + +function MOI.output_dimension(f::ArrayNonlinearFunction) + return prod(f.size) +end + +""" + ArrayOfVariableIndices{N} + +A block of contiguous `MOI.VariableIndex` values representing an N-dimensional +array. Used as an argument in `ArrayNonlinearFunction`. +""" +struct ArrayOfVariableIndices{N} <: MOI.AbstractVectorFunction + offset::Int + size::NTuple{N,Int} +end + +Base.size(a::ArrayOfVariableIndices) = a.size + +function MOI.output_dimension(f::ArrayOfVariableIndices) + return prod(f.size) +end + +function Base.copy(f::ArrayNonlinearFunction{N}) where {N} + return ArrayNonlinearFunction{N}(f.head, copy(f.args), f.size, f.broadcasted) +end + +function Base.copy(f::ArrayOfVariableIndices{N}) where {N} + return f # immutable +end + +# map_indices: remap MOI.VariableIndex values during MOI.copy_to +function MOI.Utilities.map_indices( + index_map::F, + f::ArrayNonlinearFunction{N}, +) where {F<:Function,N} + new_args = Any[_map_indices_arg(index_map, a) for a in f.args] + return ArrayNonlinearFunction{N}(f.head, new_args, f.size, f.broadcasted) +end + +function MOI.Utilities.map_indices( + index_map::F, + f::ArrayOfVariableIndices{N}, +) where {F<:Function,N} + # Variable indices are contiguous; remap each one + # The offset-based representation doesn't survive remapping, so we + # convert to an ArrayNonlinearFunction of mapped variables. 
+    # NOTE(review): remapping is NOT implemented — we return `f` unchanged. Only +    # correct when `index_map` preserves this contiguous block (JuMP direct mode). +    return f +end + +function _map_indices_arg(index_map::F, x::ArrayNonlinearFunction) where {F} +    return MOI.Utilities.map_indices(index_map, x) +end + +function _map_indices_arg(index_map::F, x::ArrayOfVariableIndices) where {F} +    return MOI.Utilities.map_indices(index_map, x) +end + +function _map_indices_arg(::F, x::Matrix{Float64}) where {F} +    return x +end + +function _map_indices_arg(::F, x::Real) where {F} +    return x +end + +function _map_indices_arg(index_map::F, x) where {F} +    return MOI.Utilities.map_indices(index_map, x) +end diff --git a/src/operators.jl b/src/operators.jl index 7a88b9f..c1de6b8 100644 --- a/src/operators.jl +++ b/src/operators.jl @@ -248,6 +248,8 @@ function eval_multivariate_function( return maximum(x) elseif op == :vect return x +    elseif op == :sum +        return sum(x; init = zero(T)) end id = registry.multivariate_operator_to_id[op] offset = id - registry.multivariate_user_operator_start diff --git a/src/parse_moi.jl b/src/parse_moi.jl new file mode 100644 index 0000000..bba8969 --- /dev/null +++ b/src/parse_moi.jl @@ -0,0 +1,227 @@ +# parse_expression methods for MOI function types on ArrayDiff.Model. +# +# These let ArrayDiff.set_objective accept MOI.ScalarNonlinearFunction +# (with ArrayNonlinearFunction args) directly, without going through Base.Expr. 
+ +# ── Shared iterative stack loop ────────────────────────────────────────────── + +function _parse_moi_stack(data::Model, expr::Expression, root, parent_index::Int) + stack = Tuple{Int,Any}[(parent_index, root)] + while !isempty(stack) + parent, item = pop!(stack) + if item isa MOI.ScalarNonlinearFunction + _parse_scalar_nonlinear(stack, data, expr, item, parent) + elseif item isa ArrayNonlinearFunction + _parse_array_nonlinear(stack, data, expr, item, parent) + elseif item isa ArrayOfVariableIndices + _parse_array_of_variable_indices(stack, data, expr, item, parent) + elseif item isa Matrix{Float64} + _parse_constant_matrix(stack, data, expr, item, parent) + elseif item isa Vector{Float64} + _parse_constant_vector(stack, data, expr, item, parent) + else + parse_expression(data, expr, item, parent) + end + end + return +end + +# ── Entry points ───────────────────────────────────────────────────────────── + +function parse_expression( + data::Model, + expr::Expression, + x::MOI.ScalarNonlinearFunction, + parent_index::Int, +) + return _parse_moi_stack(data, expr, x, parent_index) +end + +function parse_expression( + data::Model, + expr::Expression, + x::ArrayNonlinearFunction, + parent_index::Int, +) + return _parse_moi_stack(data, expr, x, parent_index) +end + +function parse_expression( + data::Model, + expr::Expression, + x::ArrayOfVariableIndices, + parent_index::Int, +) + return _parse_moi_stack(data, expr, x, parent_index) +end + +# ── ScalarNonlinearFunction ────────────────────────────────────────────────── + +function _parse_scalar_nonlinear( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::MOI.ScalarNonlinearFunction, + parent_index::Int, +) + op = x.head + nargs = length(x.args) + if nargs == 1 + id = get(data.operators.univariate_operator_to_id, op, nothing) + if id !== nothing + push!(expr.nodes, Node(NODE_CALL_UNIVARIATE, id, parent_index)) + push!(stack, (length(expr.nodes), x.args[1])) + return + end + end + id = 
get(data.operators.multivariate_operator_to_id, op, nothing) + if id === nothing + throw(MOI.UnsupportedNonlinearOperator(op)) + end + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, id, parent_index)) + for i in nargs:-1:1 + push!(stack, (length(expr.nodes), x.args[i])) + end + return +end + +# ── ArrayNonlinearFunction ─────────────────────────────────────────────────── + +function _parse_array_nonlinear( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::ArrayNonlinearFunction, + parent_index::Int, +) + op = x.head + nargs = length(x.args) + if x.broadcasted + if nargs == 1 + id = get(data.operators.univariate_operator_to_id, op, nothing) + if id !== nothing + push!( + expr.nodes, + Node(NODE_CALL_UNIVARIATE_BROADCASTED, id, parent_index), + ) + push!(stack, (length(expr.nodes), x.args[1])) + return + end + end + id = get(data.operators.multivariate_operator_to_id, op, nothing) + if id === nothing + throw(MOI.UnsupportedNonlinearOperator(op)) + end + push!( + expr.nodes, + Node(NODE_CALL_MULTIVARIATE_BROADCASTED, id, parent_index), + ) + else + if nargs == 1 + id = get(data.operators.univariate_operator_to_id, op, nothing) + if id !== nothing + push!( + expr.nodes, + Node(NODE_CALL_UNIVARIATE, id, parent_index), + ) + push!(stack, (length(expr.nodes), x.args[1])) + return + end + end + id = get(data.operators.multivariate_operator_to_id, op, nothing) + if id === nothing + throw(MOI.UnsupportedNonlinearOperator(op)) + end + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, id, parent_index)) + end + for i in nargs:-1:1 + push!(stack, (length(expr.nodes), x.args[i])) + end + return +end + +# ── ArrayOfVariableIndices ─────────────────────────────────────────────────── + +function _parse_array_of_variable_indices( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::ArrayOfVariableIndices{2}, + parent_index::Int, +) + m, n = x.size + # Build vcat(row(v11, v12, ...), row(v21, v22, ...), ...) 
+ vcat_id = data.operators.multivariate_operator_to_id[:vcat] + row_id = data.operators.multivariate_operator_to_id[:row] + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index)) + vcat_idx = length(expr.nodes) + # Push rows in reverse order for stack processing + for i in m:-1:1 + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx)) + row_idx = length(expr.nodes) + for j in n:-1:1 + vi = MOI.VariableIndex(x.offset + (j - 1) * m + i) + push!(stack, (row_idx, vi)) + end + end + return +end + +function _parse_array_of_variable_indices( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::ArrayOfVariableIndices{1}, + parent_index::Int, +) + m = x.size[1] + vect_id = data.operators.multivariate_operator_to_id[:vect] + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index)) + vect_idx = length(expr.nodes) + for i in m:-1:1 + vi = MOI.VariableIndex(x.offset + i) + push!(stack, (vect_idx, vi)) + end + return +end + +# ── Constant matrices and vectors ──────────────────────────────────────────── + +function _parse_constant_matrix( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::Matrix{Float64}, + parent_index::Int, +) + m, n = size(x) + vcat_id = data.operators.multivariate_operator_to_id[:vcat] + row_id = data.operators.multivariate_operator_to_id[:row] + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index)) + vcat_idx = length(expr.nodes) + for i in m:-1:1 + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx)) + row_idx = length(expr.nodes) + for j in n:-1:1 + push!(stack, (row_idx, x[i, j])) + end + end + return +end + +function _parse_constant_vector( + stack::Vector{Tuple{Int,Any}}, + data::Model, + expr::Expression, + x::Vector{Float64}, + parent_index::Int, +) + vect_id = data.operators.multivariate_operator_to_id[:vect] + push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index)) + vect_idx = length(expr.nodes) + for i in length(x):-1:1 
+ push!(stack, (vect_idx, x[i])) + end + return +end + diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl index 400d3aa..1b80608 100644 --- a/src/reverse_mode.jl +++ b/src/reverse_mode.jl @@ -347,6 +347,15 @@ function _forward_eval( @j f.partials_storage[ix] = v / @s f.forward_storage[k] end end + elseif node.index == 15 # sum + @assert N == 1 + ix = children_arr[first(children_indices)] + tmp_sum = zero(T) + for j in _eachindex(f.sizes, ix) + @j f.partials_storage[ix] = one(T) + tmp_sum += @j f.forward_storage[ix] + end + @s f.forward_storage[k] = tmp_sum elseif node.index == 16 # row for j in _eachindex(f.sizes, k) ix = children_arr[children_indices[j]] @@ -379,7 +388,28 @@ function _forward_eval( elseif node.type == NODE_CALL_MULTIVARIATE_BROADCASTED children_indices = SparseArrays.nzrange(f.adj, k) N = length(children_indices) - if node.index == node.index == 3 # :* + if node.index == 1 # :+ (broadcasted) + for j in _eachindex(f.sizes, k) + tmp_sum = zero(T) + for c_idx in children_indices + ix = children_arr[c_idx] + @j f.partials_storage[ix] = one(T) + tmp_sum += @j f.forward_storage[ix] + end + @j f.forward_storage[k] = tmp_sum + end + elseif node.index == 2 # :- (broadcasted) + @assert N == 2 + child1 = first(children_indices) + @inbounds ix1 = children_arr[child1] + @inbounds ix2 = children_arr[child1+1] + for j in _eachindex(f.sizes, k) + @j f.partials_storage[ix1] = one(T) + @j f.partials_storage[ix2] = -one(T) + @j f.forward_storage[k] = + @j(f.forward_storage[ix1]) - @j(f.forward_storage[ix2]) + end + elseif node.index == 3 # :* (broadcasted) # Node `k` is not scalar, so we do matrix multiplication if f.sizes.ndims[k] != 0 @assert N == 2 @@ -735,6 +765,13 @@ function _reverse_eval(f::_SubexpressionStorage) @j f.reverse_storage[ix] = val end continue + elseif op == :sum + rev_parent = @s f.reverse_storage[k] + ix = children_arr[children_indices[1]] + for j in _eachindex(f.sizes, ix) + @j f.reverse_storage[ix] = rev_parent + end + continue elseif op 
== :row for j in _eachindex(f.sizes, k) ix = children_arr[children_indices[j]] diff --git a/src/sizes.jl b/src/sizes.jl index 9c7a895..f73e469 100644 --- a/src/sizes.jl +++ b/src/sizes.jl @@ -188,6 +188,8 @@ function _infer_sizes( # TODO assert all arguments have same size elseif op == :norm # TODO actually norm should be moved to univariate + elseif op == :sum + # sum reduces array to scalar, ndims stays 0 elseif op == :+ || op == :- # TODO assert all arguments have same size _copy_size!(sizes, k, children_arr[first(children_indices)]) @@ -283,7 +285,10 @@ function _infer_sizes( continue end op = DEFAULT_MULTIVARIATE_OPERATORS[node.index] - if op == :* + if op == :+ || op == :- + # Broadcasted +/- preserves shape + _copy_size!(sizes, k, children_arr[first(children_indices)]) + elseif op == :* # TODO assert compatible sizes and all ndims should be 0 or 2 first_matrix = findfirst(children_indices) do i return !iszero(sizes.ndims[children_arr[i]]) diff --git a/test/JuMP.jl b/test/JuMP.jl index 75b9e55..9a99ecb 100644 --- a/test/JuMP.jl +++ b/test/JuMP.jl @@ -5,6 +5,10 @@ using Test using JuMP using ArrayDiff import LinearAlgebra +import MathOptInterface as MOI +import NLopt +import NLPModelsJuMP +import NLPModelsIpopt function runtests() for name in names(@__MODULE__; all = true) @@ -88,8 +92,9 @@ function test_norm() model = Model() @variable(model, W[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) loss = LinearAlgebra.norm(W) - @test loss isa JuMP.NonlinearExpr + @test loss isa ArrayDiff.GenericArrayExpr{JuMP.VariableRef,0} @test loss.head == :norm + @test loss.size == () @test length(loss.args) == 1 @test loss.args[1] === W return @@ -110,9 +115,116 @@ function test_l2_loss() @test diff_expr.args[1] === Y_hat @test diff_expr.args[2] === Y loss = LinearAlgebra.norm(diff_expr) - @test loss isa JuMP.NonlinearExpr + @test loss isa ArrayDiff.GenericArrayExpr{JuMP.VariableRef,0} @test loss.head == :norm @test loss.args[1] === diff_expr +end + +function 
test_array_subtraction() + model = Model() + @variable(model, W[1:2, 1:2], container = ArrayDiff.ArrayOfVariables) + X = rand(2, 2) + diff = W * X - X + @test diff isa ArrayDiff.MatrixExpr + @test diff.head == :- + @test size(diff) == (2, 2) + return +end + +function test_array_addition() + model = Model() + @variable(model, W[1:2, 1:2], container = ArrayDiff.ArrayOfVariables) + X = rand(2, 2) + s = W * X + X + @test s isa ArrayDiff.MatrixExpr + @test s.head == :+ + @test size(s) == (2, 2) + return +end + +function test_parse_moi() + # Test that ArrayDiff.Model can parse ArrayNonlinearFunction directly + model = Model() + @variable(model, W[1:2, 1:2], container = ArrayDiff.ArrayOfVariables) + X = rand(2, 2) + Y = W * X + diff = Y .- X + loss = LinearAlgebra.norm(diff) + f = JuMP.moi_function(loss) + @test f isa ArrayDiff.ArrayNonlinearFunction{0} + @test f.head == :norm + @test f.size == () + @test MOI.output_dimension(f) == 1 + ad_model = ArrayDiff.Model() + ArrayDiff.set_objective(ad_model, f) + @test ad_model.objective !== nothing + return +end + +function test_moi_function() + model = Model() + @variable(model, W[1:2, 1:2], container = ArrayDiff.ArrayOfVariables) + X = rand(2, 2) + Y = W * X + f = JuMP.moi_function(Y) + @test f isa ArrayDiff.ArrayNonlinearFunction{2} + @test f.head == :* + @test f.size == (2, 2) + @test !f.broadcasted + @test MOI.output_dimension(f) == 4 + return +end + +function test_neural_nlopt() + n = 2 + X = [1.0 0.5; 0.3 0.8] + target = [0.5 0.2; 0.1 0.7] + model = direct_model(NLopt.Optimizer()) + set_attribute(model, "algorithm", :LD_LBFGS) + @variable(model, W1[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) + @variable(model, W2[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) + # Use distinct starting values to break symmetry + start_W1 = [0.3 -0.2; 0.1 0.4] + start_W2 = [-0.1 0.5; 0.2 -0.3] + for i in 1:n, j in 1:n + set_start_value(W1[i, j], start_W1[i, j]) + set_start_value(W2[i, j], start_W2[i, j]) + end + Y = W2 * tanh.(W1 
* X) + loss = LinearAlgebra.norm(Y .- target) + @objective(model, Min, loss) + optimize!(model) + @test termination_status(model) == MOI.LOCALLY_SOLVED + @test objective_value(model) < 1e-6 + return +end + +function test_neural_ipopt_nlpmodels() + n = 2 + X = [1.0 0.5; 0.3 0.8] + target = [0.5 0.2; 0.1 0.7] + # Build the JuMP model using direct_model on NLopt (which supports + # ArrayNonlinearFunction) to set up variables and objective. + inner = NLopt.Optimizer() + model = direct_model(inner) + set_attribute(model, "algorithm", :LD_LBFGS) + @variable(model, W1[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) + @variable(model, W2[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) + start_W1 = [0.3 -0.2; 0.1 0.4] + start_W2 = [-0.1 0.5; 0.2 -0.3] + for i in 1:n, j in 1:n + set_start_value(W1[i, j], start_W1[i, j]) + set_start_value(W2[i, j], start_W2[i, j]) + end + Y = W2 * tanh.(W1 * X) + loss = LinearAlgebra.norm(Y .- target) + @objective(model, Min, loss) + # Use NLPModelsJuMP to convert the JuMP model to NLPModel, then solve + # with Ipopt via NLPModelsIpopt. The ad_backend on NLopt carries Mode(). + nlp = NLPModelsJuMP.MathOptNLPModel(model; hessian = false) + stats = NLPModelsIpopt.ipopt(nlp; print_level = 0) + @test stats.status == :first_order + @test stats.objective < 1e-6 return end diff --git a/test/Project.toml b/test/Project.toml index 0b5a41e..c5a057a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -5,6 +5,9 @@ GenOpt = "f2c049d8-7489-4223-990c-4f1c121a4cde" JuMP = "4076af6c-e467-56ae-b986-b466b2749572" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" +NLPModelsIpopt = "f4238b75-b362-5c4c-b852-0801c9a21d71" +NLPModelsJuMP = "792afdf1-32c1-5681-94e0-d7bf7a5df49e" +NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"