diff --git a/numpy_ml/tests/nn_torch_models.py b/numpy_ml/tests/nn_torch_models.py index a5ae3dc..dfffef2 100644 --- a/numpy_ml/tests/nn_torch_models.py +++ b/numpy_ml/tests/nn_torch_models.py @@ -4,10 +4,13 @@ import torch.nn as nn import torch.nn.functional as F -import tensorflow as tf - import numpy as np +try: + import tensorflow as tf +except ImportError: + tf = None + ####################################################################### # Gold-standard implementations for testing custom layers # # (Requires Pytorch) # @@ -1804,489 +1807,490 @@ def extract_grads(self, Q, K, V, mask=None): return grads -####################################################################### -# TF WGAN GP Gold Standard Implementation # -# adapted from: https://github.com/igul222/improved_wgan_training/ # -####################################################################### +if tf is not None: + ####################################################################### + # TF WGAN GP Gold Standard Implementation # + # adapted from: https://github.com/igul222/improved_wgan_training/ # + ####################################################################### -_params = {} -_param_aliases = {} + _params = {} + _param_aliases = {} -def param(name, *args, **kwargs): - """ - A wrapper for `tf.Variable` which enables parameter sharing in models. + def param(name, *args, **kwargs): + """ + A wrapper for `tf.Variable` which enables parameter sharing in models. - Creates and returns theano shared variables similarly to `tf.Variable`, - except if you try to create a param with the same name as a - previously-created one, `param(...)` will just return the old one instead of - making a new one. + Creates and returns theano shared variables similarly to `tf.Variable`, + except if you try to create a param with the same name as a + previously-created one, `param(...)` will just return the old one instead of + making a new one. - This constructor also adds a `param` attribute to the shared variables it - creates, so that you can easily search a graph for all params. - """ + This constructor also adds a `param` attribute to the shared variables it + creates, so that you can easily search a graph for all params. + """ + + if name not in _params: + kwargs["name"] = name + param = tf.Variable(*args, **kwargs) + param.param = True + _params[name] = param + result = _params[name] + i = 0 + while result in _param_aliases: + i += 1 + result = _param_aliases[result] + return result - if name not in _params: - kwargs["name"] = name - param = tf.Variable(*args, **kwargs) - param.param = True - _params[name] = param - result = _params[name] - i = 0 - while result in _param_aliases: - i += 1 - result = _param_aliases[result] - return result - - -def params_with_name(name): - return [p for n, p in _params.items() if name in n] - - -def ReLULayer(name, n_in, n_out, inputs, w_initialization): - if isinstance(w_initialization, np.ndarray): - weight_values = w_initialization.astype("float32") - - W = param(name + ".W", weight_values) - result = tf.matmul(inputs, W) - output = tf.nn.bias_add( - result, param(name + ".b", np.zeros((n_out,), dtype="float32")) - ) - output = tf.nn.relu(output) - return output, W - - -def LinearLayer(name, n_in, n_out, inputs, w_initialization): - if isinstance(w_initialization, np.ndarray): - weight_values = w_initialization.astype("float32") - - W = param(name + ".W", weight_values) - result = tf.matmul(inputs, W) - output = tf.nn.bias_add( - result, param(name + ".b", np.zeros((n_out,), dtype="float32")) - ) - return output, W - - -def Generator(n_samples, X_real, params=None): - n_feats = 2 - W1 = W2 = W3 = W4 = "he" - noise = tf.random.normal([n_samples, 2]) - if params is not None: - noise = tf.convert_to_tensor(params["noise"], dtype="float32") - W1 = params["generator"]["FC1"]["W"] - W2 = params["generator"]["FC2"]["W"] - W3 = params["generator"]["FC3"]["W"] - W4 = params["generator"]["FC4"]["W"] - DIM = params["g_hidden"] - n_feats = params["n_in"] - - outs = {} - weights = {} - output, W = ReLULayer("Generator.1", n_feats, DIM, noise, w_initialization=W1) - outs["FC1"] = output - weights["FC1"] = W - output, W = ReLULayer("Generator.2", DIM, DIM, output, w_initialization=W2) - outs["FC2"] = output - weights["FC2"] = W - output, W = ReLULayer("Generator.3", DIM, DIM, output, w_initialization=W3) - outs["FC3"] = output - weights["FC3"] = W - output, W = LinearLayer("Generator.4", DIM, n_feats, output, w_initialization=W4) - outs["FC4"] = output - weights["FC4"] = W - return output, outs, weights - - -def Discriminator(inputs, params=None): - n_feats = 2 - W1 = W2 = W3 = W4 = "he" - if params is not None: - W1 = params["critic"]["FC1"]["W"] - W2 = params["critic"]["FC2"]["W"] - W3 = params["critic"]["FC3"]["W"] - W4 = params["critic"]["FC4"]["W"] - DIM = params["g_hidden"] - n_feats = params["n_in"] - - outs = {} - weights = {} - output, W = ReLULayer("Discriminator.1", n_feats, DIM, inputs, w_initialization=W1) - outs["FC1"] = output - weights["FC1"] = W - - output, W = ReLULayer("Discriminator.2", DIM, DIM, output, w_initialization=W2) - outs["FC2"] = output - weights["FC2"] = W - - output, W = ReLULayer("Discriminator.3", DIM, DIM, output, w_initialization=W3) - outs["FC3"] = output - weights["FC3"] = W - - output, W = LinearLayer("Discriminator.4", DIM, 1, output, w_initialization=W4) - outs["FC4"] = output - weights["FC4"] = W - - # get bias - for var in params_with_name("Discriminator"): - if "1.b:" in var.name: - weights["FC1_b"] = var - elif "2.b:" in var.name: - weights["FC2_b"] = var - elif "3.b:" in var.name: - weights["FC3_b"] = var - elif "4.b:" in var.name: - weights["FC4_b"] = var - - return tf.reshape(output, [-1]), outs, weights - - -def WGAN_GP_tf(X, lambda_, params, batch_size): - tf.compat.v1.disable_eager_execution() - - batch_size = X.shape[0] - - # get alpha value - n_steps = params["n_steps"] - c_updates_per_epoch = params["c_updates_per_epoch"] - alpha = tf.convert_to_tensor(params["alpha"], dtype="float32") - - X_real = tf.compat.v1.placeholder(tf.float32, shape=[None, params["n_in"]]) - X_fake, G_out_X_fake, G_weights = Generator(batch_size, X_real, params) - - Y_real, C_out_Y_real, C_Y_real_weights = Discriminator(X_real, params) - Y_fake, C_out_Y_fake, C_Y_fake_weights = Discriminator(X_fake, params) - - # WGAN loss - mean_fake = tf.reduce_mean(Y_fake) - mean_real = tf.reduce_mean(Y_real) - - C_loss = tf.reduce_mean(Y_fake) - tf.reduce_mean(Y_real) - G_loss = -tf.reduce_mean(Y_fake) - - # WGAN gradient penalty - X_interp = alpha * X_real + ((1 - alpha) * X_fake) - Y_interp, C_out_Y_interp, C_Y_interp_weights = Discriminator(X_interp, params) - gradInterp = tf.gradients(Y_interp, [X_interp])[0] - - norm_gradInterp = tf.sqrt( - tf.compat.v1.reduce_sum(tf.square(gradInterp), reduction_indices=[1]) - ) - gradient_penalty = tf.reduce_mean((norm_gradInterp - 1) ** 2) - C_loss += lambda_ * gradient_penalty - - # extract gradient of Y_interp wrt. each layer output in critic - C_bwd_Y_interp = {} - for k, v in C_out_Y_interp.items(): - C_bwd_Y_interp[k] = tf.gradients(Y_interp, [v])[0] - - C_bwd_W = {} - for k, v in C_Y_interp_weights.items(): - C_bwd_W[k] = tf.gradients(C_loss, [v])[0] - - # get gradients - dC_Y_fake = tf.gradients(C_loss, [Y_fake])[0] - dC_Y_real = tf.gradients(C_loss, [Y_real])[0] - dC_gradInterp = tf.gradients(C_loss, [gradInterp])[0] - dG_Y_fake = tf.gradients(G_loss, [Y_fake])[0] - - with tf.compat.v1.Session() as session: - session.run(tf.compat.v1.global_variables_initializer()) - - for iteration in range(n_steps): - # Train critic - for i in range(c_updates_per_epoch): - _data = X - ( - _alpha, - _X_interp, - _Y_interp, - _gradInterp, - _norm_gradInterp, - _gradient_penalty, - _C_loss, - _X_fake, - _Y_fake, - _Y_real, - _dC_Y_fake, - _dC_Y_real, - _dC_gradInterp, - _dG_Y_fake, - _mean_fake, - _mean_real, - _G_weights_FC1, - _G_weights_FC2, - _G_weights_FC3, - _G_weights_FC4, - _G_fwd_X_fake_FC1, - _G_fwd_X_fake_FC2, - _G_fwd_X_fake_FC3, - _G_fwd_X_fake_FC4, - _C_weights_Y_fake_FC1, - _C_weights_Y_fake_FC2, - _C_weights_Y_fake_FC3, - _C_weights_Y_fake_FC4, - _C_fwd_Y_fake_FC1, - _C_fwd_Y_fake_FC2, - _C_fwd_Y_fake_FC3, - _C_fwd_Y_fake_FC4, - _C_weights_Y_real_FC1, - _C_weights_Y_real_FC2, - _C_weights_Y_real_FC3, - _C_weights_Y_real_FC4, - _C_fwd_Y_real_FC1, - _C_fwd_Y_real_FC2, - _C_fwd_Y_real_FC3, - _C_fwd_Y_real_FC4, - _C_weights_Y_interp_FC1, - _C_weights_Y_interp_FC2, - _C_weights_Y_interp_FC3, - _C_weights_Y_interp_FC4, - _C_dY_interp_wrt_FC1, - _C_dY_interp_wrt_FC2, - _C_dY_interp_wrt_FC3, - _C_dY_interp_wrt_FC4, - _C_fwd_Y_interp_FC1, - _C_fwd_Y_interp_FC2, - _C_fwd_Y_interp_FC3, - _C_fwd_Y_interp_FC4, - _C_dW_FC1, - _C_db_FC1, - _C_dW_FC2, - _C_db_FC2, - _C_dW_FC3, - _C_db_FC3, - _C_dW_FC4, - _C_db_FC4, - ) = session.run( - [ - alpha, - X_interp, - Y_interp, - gradInterp, - norm_gradInterp, - gradient_penalty, - C_loss, - X_fake, - Y_fake, - Y_real, - dC_Y_fake, - dC_Y_real, - dC_gradInterp, - dG_Y_fake, - mean_fake, - mean_real, - G_weights["FC1"], - G_weights["FC2"], - G_weights["FC3"], - G_weights["FC4"], - G_out_X_fake["FC1"], - G_out_X_fake["FC2"], - G_out_X_fake["FC3"], - G_out_X_fake["FC4"], - C_Y_fake_weights["FC1"], - C_Y_fake_weights["FC2"], - C_Y_fake_weights["FC3"], - C_Y_fake_weights["FC4"], - C_out_Y_fake["FC1"], - C_out_Y_fake["FC2"], - C_out_Y_fake["FC3"], - C_out_Y_fake["FC4"], - C_Y_real_weights["FC1"], - C_Y_real_weights["FC2"], - C_Y_real_weights["FC3"], - C_Y_real_weights["FC4"], - C_out_Y_real["FC1"], - C_out_Y_real["FC2"], - C_out_Y_real["FC3"], - C_out_Y_real["FC4"], - C_Y_interp_weights["FC1"], - C_Y_interp_weights["FC2"], - C_Y_interp_weights["FC3"], - C_Y_interp_weights["FC4"], - C_bwd_Y_interp["FC1"], - C_bwd_Y_interp["FC2"], - C_bwd_Y_interp["FC3"], - C_bwd_Y_interp["FC4"], - C_out_Y_interp["FC1"], - C_out_Y_interp["FC2"], - C_out_Y_interp["FC3"], - C_out_Y_interp["FC4"], - C_bwd_W["FC1"], - C_bwd_W["FC1_b"], - C_bwd_W["FC2"], - C_bwd_W["FC2_b"], - C_bwd_W["FC3"], - C_bwd_W["FC3_b"], - C_bwd_W["FC4"], - C_bwd_W["FC4_b"], - ], - feed_dict={X_real: _data}, - ) - - _G_loss = session.run(G_loss, feed_dict={X_real: _data}) - grads = { - "X_real": _data, - "X_interp": _X_interp, - "G_weights_FC1": _G_weights_FC1, - "G_weights_FC2": _G_weights_FC2, - "G_weights_FC3": _G_weights_FC3, - "G_weights_FC4": _G_weights_FC4, - "G_fwd_X_fake_FC1": _G_fwd_X_fake_FC1, - "G_fwd_X_fake_FC2": _G_fwd_X_fake_FC2, - "G_fwd_X_fake_FC3": _G_fwd_X_fake_FC3, - "G_fwd_X_fake_FC4": _G_fwd_X_fake_FC4, - "X_fake": _X_fake, - "C_weights_Y_fake_FC1": _C_weights_Y_fake_FC1, - "C_weights_Y_fake_FC2": _C_weights_Y_fake_FC2, - "C_weights_Y_fake_FC3": _C_weights_Y_fake_FC3, - "C_weights_Y_fake_FC4": _C_weights_Y_fake_FC4, - "C_fwd_Y_fake_FC1": _C_fwd_Y_fake_FC1, - "C_fwd_Y_fake_FC2": _C_fwd_Y_fake_FC2, - "C_fwd_Y_fake_FC3": _C_fwd_Y_fake_FC3, - "C_fwd_Y_fake_FC4": _C_fwd_Y_fake_FC4, - "Y_fake": _Y_fake, - "C_weights_Y_real_FC1": _C_weights_Y_real_FC1, - "C_weights_Y_real_FC2": _C_weights_Y_real_FC2, - "C_weights_Y_real_FC3": _C_weights_Y_real_FC3, - "C_weights_Y_real_FC4": _C_weights_Y_real_FC4, - "C_fwd_Y_real_FC1": _C_fwd_Y_real_FC1, - "C_fwd_Y_real_FC2": _C_fwd_Y_real_FC2, - "C_fwd_Y_real_FC3": _C_fwd_Y_real_FC3, - "C_fwd_Y_real_FC4": _C_fwd_Y_real_FC4, - "Y_real": _Y_real, - "C_weights_Y_interp_FC1": _C_weights_Y_interp_FC1, - "C_weights_Y_interp_FC2": _C_weights_Y_interp_FC2, - "C_weights_Y_interp_FC3": _C_weights_Y_interp_FC3, - "C_weights_Y_interp_FC4": _C_weights_Y_interp_FC4, - "C_fwd_Y_interp_FC1": _C_fwd_Y_interp_FC1, - "C_fwd_Y_interp_FC2": _C_fwd_Y_interp_FC2, - "C_fwd_Y_interp_FC3": _C_fwd_Y_interp_FC3, - "C_fwd_Y_interp_FC4": _C_fwd_Y_interp_FC4, - "Y_interp": _Y_interp, - "dY_interp_wrt_FC1": _C_dY_interp_wrt_FC1, - "dY_interp_wrt_FC2": _C_dY_interp_wrt_FC2, - "dY_interp_wrt_FC3": _C_dY_interp_wrt_FC3, - "dY_interp_wrt_FC4": _C_dY_interp_wrt_FC4, - "gradInterp": _gradInterp, - "gradInterp_norm": _norm_gradInterp, - "G_loss": _G_loss, - "C_loss": _C_loss, - "dC_loss_dW_FC1": _C_dW_FC1, - "dC_loss_db_FC1": _C_db_FC1, - "dC_loss_dW_FC2": _C_dW_FC2, - "dC_loss_db_FC2": _C_db_FC2, - "dC_loss_dW_FC3": _C_dW_FC3, - "dC_loss_db_FC3": _C_db_FC3, - "dC_loss_dW_FC4": _C_dW_FC4, - "dC_loss_db_FC4": _C_db_FC4, - "dC_Y_fake": _dC_Y_fake, - "dC_Y_real": _dC_Y_real, - "dC_gradInterp": _dC_gradInterp, - "dG_Y_fake": _dG_Y_fake, + def params_with_name(name): + return [p for n, p in _params.items() if name in n] + + + def ReLULayer(name, n_in, n_out, inputs, w_initialization): + if isinstance(w_initialization, np.ndarray): + weight_values = w_initialization.astype("float32") + + W = param(name + ".W", weight_values) + result = tf.matmul(inputs, W) + output = tf.nn.bias_add( + result, param(name + ".b", np.zeros((n_out,), dtype="float32")) + ) + output = tf.nn.relu(output) + return output, W + + + def LinearLayer(name, n_in, n_out, inputs, w_initialization): + if isinstance(w_initialization, np.ndarray): + weight_values = w_initialization.astype("float32") + + W = param(name + ".W", weight_values) + result = tf.matmul(inputs, W) + output = tf.nn.bias_add( + result, param(name + ".b", np.zeros((n_out,), dtype="float32")) + ) + return output, W + + + def Generator(n_samples, X_real, params=None): + n_feats = 2 + W1 = W2 = W3 = W4 = "he" + noise = tf.random.normal([n_samples, 2]) + if params is not None: + noise = tf.convert_to_tensor(params["noise"], dtype="float32") + W1 = params["generator"]["FC1"]["W"] + W2 = params["generator"]["FC2"]["W"] + W3 = params["generator"]["FC3"]["W"] + W4 = params["generator"]["FC4"]["W"] + DIM = params["g_hidden"] + n_feats = params["n_in"] + + outs = {} + weights = {} + output, W = ReLULayer("Generator.1", n_feats, DIM, noise, w_initialization=W1) + outs["FC1"] = output + weights["FC1"] = W + output, W = ReLULayer("Generator.2", DIM, DIM, output, w_initialization=W2) + outs["FC2"] = output + weights["FC2"] = W + output, W = ReLULayer("Generator.3", DIM, DIM, output, w_initialization=W3) + outs["FC3"] = output + weights["FC3"] = W + output, W = LinearLayer("Generator.4", DIM, n_feats, output, w_initialization=W4) + outs["FC4"] = output + weights["FC4"] = W + return output, outs, weights + + + def Discriminator(inputs, params=None): + n_feats = 2 + W1 = W2 = W3 = W4 = "he" + if params is not None: + W1 = params["critic"]["FC1"]["W"] + W2 = params["critic"]["FC2"]["W"] + W3 = params["critic"]["FC3"]["W"] + W4 = params["critic"]["FC4"]["W"] + DIM = params["g_hidden"] + n_feats = params["n_in"] + + outs = {} + weights = {} + output, W = ReLULayer("Discriminator.1", n_feats, DIM, inputs, w_initialization=W1) + outs["FC1"] = output + weights["FC1"] = W + + output, W = ReLULayer("Discriminator.2", DIM, DIM, output, w_initialization=W2) + outs["FC2"] = output + weights["FC2"] = W + + output, W = ReLULayer("Discriminator.3", DIM, DIM, output, w_initialization=W3) + outs["FC3"] = output + weights["FC3"] = W + + output, W = LinearLayer("Discriminator.4", DIM, 1, output, w_initialization=W4) + outs["FC4"] = output + weights["FC4"] = W + + # get bias + for var in params_with_name("Discriminator"): + if "1.b:" in var.name: + weights["FC1_b"] = var + elif "2.b:" in var.name: + weights["FC2_b"] = var + elif "3.b:" in var.name: + weights["FC3_b"] = var + elif "4.b:" in var.name: + weights["FC4_b"] = var + + return tf.reshape(output, [-1]), outs, weights + + + def WGAN_GP_tf(X, lambda_, params, batch_size): + tf.compat.v1.disable_eager_execution() + + batch_size = X.shape[0] + + # get alpha value + n_steps = params["n_steps"] + c_updates_per_epoch = params["c_updates_per_epoch"] + alpha = tf.convert_to_tensor(params["alpha"], dtype="float32") + + X_real = tf.compat.v1.placeholder(tf.float32, shape=[None, params["n_in"]]) + X_fake, G_out_X_fake, G_weights = Generator(batch_size, X_real, params) + + Y_real, C_out_Y_real, C_Y_real_weights = Discriminator(X_real, params) + Y_fake, C_out_Y_fake, C_Y_fake_weights = Discriminator(X_fake, params) + + # WGAN loss + mean_fake = tf.reduce_mean(Y_fake) + mean_real = tf.reduce_mean(Y_real) + + C_loss = tf.reduce_mean(Y_fake) - tf.reduce_mean(Y_real) + G_loss = -tf.reduce_mean(Y_fake) + + # WGAN gradient penalty + X_interp = alpha * X_real + ((1 - alpha) * X_fake) + Y_interp, C_out_Y_interp, C_Y_interp_weights = Discriminator(X_interp, params) + gradInterp = tf.gradients(Y_interp, [X_interp])[0] + + norm_gradInterp = tf.sqrt( + tf.compat.v1.reduce_sum(tf.square(gradInterp), reduction_indices=[1]) + ) + gradient_penalty = tf.reduce_mean((norm_gradInterp - 1) ** 2) + C_loss += lambda_ * gradient_penalty + + # extract gradient of Y_interp wrt. each layer output in critic + C_bwd_Y_interp = {} + for k, v in C_out_Y_interp.items(): + C_bwd_Y_interp[k] = tf.gradients(Y_interp, [v])[0] + + C_bwd_W = {} + for k, v in C_Y_interp_weights.items(): + C_bwd_W[k] = tf.gradients(C_loss, [v])[0] + + # get gradients + dC_Y_fake = tf.gradients(C_loss, [Y_fake])[0] + dC_Y_real = tf.gradients(C_loss, [Y_real])[0] + dC_gradInterp = tf.gradients(C_loss, [gradInterp])[0] + dG_Y_fake = tf.gradients(G_loss, [Y_fake])[0] + + with tf.compat.v1.Session() as session: + session.run(tf.compat.v1.global_variables_initializer()) + + for iteration in range(n_steps): + # Train critic + for i in range(c_updates_per_epoch): + _data = X + ( + _alpha, + _X_interp, + _Y_interp, + _gradInterp, + _norm_gradInterp, + _gradient_penalty, + _C_loss, + _X_fake, + _Y_fake, + _Y_real, + _dC_Y_fake, + _dC_Y_real, + _dC_gradInterp, + _dG_Y_fake, + _mean_fake, + _mean_real, + _G_weights_FC1, + _G_weights_FC2, + _G_weights_FC3, + _G_weights_FC4, + _G_fwd_X_fake_FC1, + _G_fwd_X_fake_FC2, + _G_fwd_X_fake_FC3, + _G_fwd_X_fake_FC4, + _C_weights_Y_fake_FC1, + _C_weights_Y_fake_FC2, + _C_weights_Y_fake_FC3, + _C_weights_Y_fake_FC4, + _C_fwd_Y_fake_FC1, + _C_fwd_Y_fake_FC2, + _C_fwd_Y_fake_FC3, + _C_fwd_Y_fake_FC4, + _C_weights_Y_real_FC1, + _C_weights_Y_real_FC2, + _C_weights_Y_real_FC3, + _C_weights_Y_real_FC4, + _C_fwd_Y_real_FC1, + _C_fwd_Y_real_FC2, + _C_fwd_Y_real_FC3, + _C_fwd_Y_real_FC4, + _C_weights_Y_interp_FC1, + _C_weights_Y_interp_FC2, + _C_weights_Y_interp_FC3, + _C_weights_Y_interp_FC4, + _C_dY_interp_wrt_FC1, + _C_dY_interp_wrt_FC2, + _C_dY_interp_wrt_FC3, + _C_dY_interp_wrt_FC4, + _C_fwd_Y_interp_FC1, + _C_fwd_Y_interp_FC2, + _C_fwd_Y_interp_FC3, + _C_fwd_Y_interp_FC4, + _C_dW_FC1, + _C_db_FC1, + _C_dW_FC2, + _C_db_FC2, + _C_dW_FC3, + _C_db_FC3, + _C_dW_FC4, + _C_db_FC4, + ) = session.run( + [ + alpha, + X_interp, + Y_interp, + gradInterp, + norm_gradInterp, + gradient_penalty, + C_loss, + X_fake, + Y_fake, + Y_real, + dC_Y_fake, + dC_Y_real, + dC_gradInterp, + dG_Y_fake, + mean_fake, + mean_real, + G_weights["FC1"], + G_weights["FC2"], + G_weights["FC3"], + G_weights["FC4"], + G_out_X_fake["FC1"], + G_out_X_fake["FC2"], + G_out_X_fake["FC3"], + G_out_X_fake["FC4"], + C_Y_fake_weights["FC1"], + C_Y_fake_weights["FC2"], + C_Y_fake_weights["FC3"], + C_Y_fake_weights["FC4"], + C_out_Y_fake["FC1"], + C_out_Y_fake["FC2"], + C_out_Y_fake["FC3"], + C_out_Y_fake["FC4"], + C_Y_real_weights["FC1"], + C_Y_real_weights["FC2"], + C_Y_real_weights["FC3"], + C_Y_real_weights["FC4"], + C_out_Y_real["FC1"], + C_out_Y_real["FC2"], + C_out_Y_real["FC3"], + C_out_Y_real["FC4"], + C_Y_interp_weights["FC1"], + C_Y_interp_weights["FC2"], + C_Y_interp_weights["FC3"], + C_Y_interp_weights["FC4"], + C_bwd_Y_interp["FC1"], + C_bwd_Y_interp["FC2"], + C_bwd_Y_interp["FC3"], + C_bwd_Y_interp["FC4"], + C_out_Y_interp["FC1"], + C_out_Y_interp["FC2"], + C_out_Y_interp["FC3"], + C_out_Y_interp["FC4"], + C_bwd_W["FC1"], + C_bwd_W["FC1_b"], + C_bwd_W["FC2"], + C_bwd_W["FC2_b"], + C_bwd_W["FC3"], + C_bwd_W["FC3_b"], + C_bwd_W["FC4"], + C_bwd_W["FC4_b"], + ], + feed_dict={X_real: _data}, + ) + + _G_loss = session.run(G_loss, feed_dict={X_real: _data}) + + grads = { + "X_real": _data, + "X_interp": _X_interp, + "G_weights_FC1": _G_weights_FC1, + "G_weights_FC2": _G_weights_FC2, + "G_weights_FC3": _G_weights_FC3, + "G_weights_FC4": _G_weights_FC4, + "G_fwd_X_fake_FC1": _G_fwd_X_fake_FC1, + "G_fwd_X_fake_FC2": _G_fwd_X_fake_FC2, + "G_fwd_X_fake_FC3": _G_fwd_X_fake_FC3, + "G_fwd_X_fake_FC4": _G_fwd_X_fake_FC4, + "X_fake": _X_fake, + "C_weights_Y_fake_FC1": _C_weights_Y_fake_FC1, + "C_weights_Y_fake_FC2": _C_weights_Y_fake_FC2, + "C_weights_Y_fake_FC3": _C_weights_Y_fake_FC3, + "C_weights_Y_fake_FC4": _C_weights_Y_fake_FC4, + "C_fwd_Y_fake_FC1": _C_fwd_Y_fake_FC1, + "C_fwd_Y_fake_FC2": _C_fwd_Y_fake_FC2, + "C_fwd_Y_fake_FC3": _C_fwd_Y_fake_FC3, + "C_fwd_Y_fake_FC4": _C_fwd_Y_fake_FC4, + "Y_fake": _Y_fake, + "C_weights_Y_real_FC1": _C_weights_Y_real_FC1, + "C_weights_Y_real_FC2": _C_weights_Y_real_FC2, + "C_weights_Y_real_FC3": _C_weights_Y_real_FC3, + "C_weights_Y_real_FC4": _C_weights_Y_real_FC4, + "C_fwd_Y_real_FC1": _C_fwd_Y_real_FC1, + "C_fwd_Y_real_FC2": _C_fwd_Y_real_FC2, + "C_fwd_Y_real_FC3": _C_fwd_Y_real_FC3, + "C_fwd_Y_real_FC4": _C_fwd_Y_real_FC4, + "Y_real": _Y_real, + "C_weights_Y_interp_FC1": _C_weights_Y_interp_FC1, + "C_weights_Y_interp_FC2": _C_weights_Y_interp_FC2, + "C_weights_Y_interp_FC3": _C_weights_Y_interp_FC3, + "C_weights_Y_interp_FC4": _C_weights_Y_interp_FC4, + "C_fwd_Y_interp_FC1": _C_fwd_Y_interp_FC1, + "C_fwd_Y_interp_FC2": _C_fwd_Y_interp_FC2, + "C_fwd_Y_interp_FC3": _C_fwd_Y_interp_FC3, + "C_fwd_Y_interp_FC4": _C_fwd_Y_interp_FC4, + "Y_interp": _Y_interp, + "dY_interp_wrt_FC1": _C_dY_interp_wrt_FC1, + "dY_interp_wrt_FC2": _C_dY_interp_wrt_FC2, + "dY_interp_wrt_FC3": _C_dY_interp_wrt_FC3, + "dY_interp_wrt_FC4": _C_dY_interp_wrt_FC4, + "gradInterp": _gradInterp, + "gradInterp_norm": _norm_gradInterp, + "G_loss": _G_loss, + "C_loss": _C_loss, + "dC_loss_dW_FC1": _C_dW_FC1, + "dC_loss_db_FC1": _C_db_FC1, + "dC_loss_dW_FC2": _C_dW_FC2, + "dC_loss_db_FC2": _C_db_FC2, + "dC_loss_dW_FC3": _C_dW_FC3, + "dC_loss_db_FC3": _C_db_FC3, + "dC_loss_dW_FC4": _C_dW_FC4, + "dC_loss_db_FC4": _C_db_FC4, + "dC_Y_fake": _dC_Y_fake, + "dC_Y_real": _dC_Y_real, + "dC_gradInterp": _dC_gradInterp, + "dG_Y_fake": _dG_Y_fake, + } + return grads + + + def TFNCELoss(X, target_word, L): + from tensorflow.python.ops.nn_impl import _compute_sampled_logits + from tensorflow.python.ops.nn_impl import sigmoid_cross_entropy_with_logits + + tf.compat.v1.disable_eager_execution() + + in_embed = tf.compat.v1.placeholder(tf.float32, shape=X.shape) + in_bias = tf.compat.v1.placeholder( + tf.float32, shape=L.parameters["b"].flatten().shape + ) + in_weights = tf.compat.v1.placeholder(tf.float32, shape=L.parameters["W"].shape) + in_target_word = tf.compat.v1.placeholder(tf.int64) + in_neg_samples = tf.compat.v1.placeholder(tf.int32) + in_target_prob = tf.compat.v1.placeholder(tf.float32) + in_neg_samp_prob = tf.compat.v1.placeholder(tf.float32) + + # in_embed = tf.keras.Input(dtype=tf.float32, shape=X.shape) + # in_bias = tf.keras.Input(dtype=tf.float32, shape=L.parameters["b"].flatten().shape) + # in_weights = tf.keras.Input(dtype=tf.float32, shape=L.parameters["W"].shape) + # in_target_word = tf.keras.Input(dtype=tf.int64, shape=()) + # in_neg_samples = tf.keras.Input(dtype=tf.int32, shape=()) + # in_target_prob = tf.keras.Input(dtype=tf.float32, shape=()) + # in_neg_samp_prob = tf.keras.Input(dtype=tf.float32, shape=()) + + feed = { + in_embed: X, + in_weights: L.parameters["W"], + in_target_word: target_word, + in_bias: L.parameters["b"].flatten(), + in_neg_samples: L.derived_variables["noise_samples"][0], + in_target_prob: L.derived_variables["noise_samples"][1], + in_neg_samp_prob: L.derived_variables["noise_samples"][2], } - return grads - - -def TFNCELoss(X, target_word, L): - from tensorflow.python.ops.nn_impl import _compute_sampled_logits - from tensorflow.python.ops.nn_impl import sigmoid_cross_entropy_with_logits - - tf.compat.v1.disable_eager_execution() - - in_embed = tf.compat.v1.placeholder(tf.float32, shape=X.shape) - in_bias = tf.compat.v1.placeholder( - tf.float32, shape=L.parameters["b"].flatten().shape - ) - in_weights = tf.compat.v1.placeholder(tf.float32, shape=L.parameters["W"].shape) - in_target_word = tf.compat.v1.placeholder(tf.int64) - in_neg_samples = tf.compat.v1.placeholder(tf.int32) - in_target_prob = tf.compat.v1.placeholder(tf.float32) - in_neg_samp_prob = tf.compat.v1.placeholder(tf.float32) - - # in_embed = tf.keras.Input(dtype=tf.float32, shape=X.shape) - # in_bias = tf.keras.Input(dtype=tf.float32, shape=L.parameters["b"].flatten().shape) - # in_weights = tf.keras.Input(dtype=tf.float32, shape=L.parameters["W"].shape) - # in_target_word = tf.keras.Input(dtype=tf.int64, shape=()) - # in_neg_samples = tf.keras.Input(dtype=tf.int32, shape=()) - # in_target_prob = tf.keras.Input(dtype=tf.float32, shape=()) - # in_neg_samp_prob = tf.keras.Input(dtype=tf.float32, shape=()) - - feed = { - in_embed: X, - in_weights: L.parameters["W"], - in_target_word: target_word, - in_bias: L.parameters["b"].flatten(), - in_neg_samples: L.derived_variables["noise_samples"][0], - in_target_prob: L.derived_variables["noise_samples"][1], - in_neg_samp_prob: L.derived_variables["noise_samples"][2], - } - - # Compute the NCE loss, using a sample of the negative labels each time. - nce_unreduced = tf.nn.nce_loss( - weights=in_weights, - biases=in_bias, - labels=in_target_word, - inputs=in_embed, - sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob), - num_sampled=L.num_negative_samples, - num_classes=L.n_classes, - ) - - loss = tf.reduce_sum(nce_unreduced) - dLdW = tf.gradients(loss, [in_weights])[0] - dLdb = tf.gradients(loss, [in_bias])[0] - dLdX = tf.gradients(loss, [in_embed])[0] - - sampled_logits, sampled_labels = _compute_sampled_logits( - weights=in_weights, - biases=in_bias, - labels=in_target_word, - inputs=in_embed, - sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob), - num_sampled=L.num_negative_samples, - num_classes=L.n_classes, - num_true=1, - subtract_log_q=True, - ) - - sampled_losses = sigmoid_cross_entropy_with_logits( - labels=sampled_labels, logits=sampled_logits - ) - - with tf.compat.v1.Session() as session: - session.run(tf.compat.v1.global_variables_initializer()) - ( - _final_loss, - _nce_unreduced, - _dLdW, - _dLdb, - _dLdX, - _sampled_logits, - _sampled_labels, - _sampled_losses, - ) = session.run( - [ - loss, - nce_unreduced, - dLdW, - dLdb, - dLdX, - sampled_logits, - sampled_labels, - sampled_losses, - ], - feed_dict=feed, + + # Compute the NCE loss, using a sample of the negative labels each time. + nce_unreduced = tf.nn.nce_loss( + weights=in_weights, + biases=in_bias, + labels=in_target_word, + inputs=in_embed, + sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob), + num_sampled=L.num_negative_samples, + num_classes=L.n_classes, + ) + + loss = tf.reduce_sum(nce_unreduced) + dLdW = tf.gradients(loss, [in_weights])[0] + dLdb = tf.gradients(loss, [in_bias])[0] + dLdX = tf.gradients(loss, [in_embed])[0] + + sampled_logits, sampled_labels = _compute_sampled_logits( + weights=in_weights, + biases=in_bias, + labels=in_target_word, + inputs=in_embed, + sampled_values=(in_neg_samples, in_target_prob, in_neg_samp_prob), + num_sampled=L.num_negative_samples, + num_classes=L.n_classes, + num_true=1, + subtract_log_q=True, + ) + + sampled_losses = sigmoid_cross_entropy_with_logits( + labels=sampled_labels, logits=sampled_logits ) - tf.compat.v1.reset_default_graph() - return { - "final_loss": _final_loss, - "nce_unreduced": _nce_unreduced, - "dLdW": _dLdW, - "dLdb": _dLdb, - "dLdX": _dLdX, - "out_logits": _sampled_logits, - "out_labels": _sampled_labels, - "sampled_loss": _sampled_losses, - } + + with tf.compat.v1.Session() as session: + session.run(tf.compat.v1.global_variables_initializer()) + ( + _final_loss, + _nce_unreduced, + _dLdW, + _dLdb, + _dLdX, + _sampled_logits, + _sampled_labels, + _sampled_losses, + ) = session.run( + [ + loss, + nce_unreduced, + dLdW, + dLdb, + dLdX, + sampled_logits, + sampled_labels, + sampled_losses, + ], + feed_dict=feed, + ) + tf.compat.v1.reset_default_graph() + return { + "final_loss": _final_loss, + "nce_unreduced": _nce_unreduced, + "dLdW": _dLdW, + "dLdb": _dLdb, + "dLdX": _dLdX, + "out_logits": _sampled_logits, + "out_labels": _sampled_labels, + "sampled_loss": _sampled_losses, + } diff --git a/numpy_ml/tests/test_nn.py b/numpy_ml/tests/test_nn.py index 1a42562..d4a894e 100644 --- a/numpy_ml/tests/test_nn.py +++ b/numpy_ml/tests/test_nn.py @@ -1,4 +1,5 @@ # flake8: noqa +import sys import time from copy import deepcopy @@ -10,6 +11,8 @@ # for testing sigmoid from scipy.special import expit +import pytest + import torch import torch.nn as nn import torch.nn.functional as F @@ -28,8 +31,6 @@ ) from .nn_torch_models import ( - TFNCELoss, - WGAN_GP_tf, torch_xe_grad, torch_mse_grad, TorchVAELoss, @@ -56,6 +57,9 @@ TorchMultiHeadedAttentionModule, ) +if "tensorflow" in sys.modules: + from .nn_torch_models import TFNCELoss, WGAN_GP_tf + ####################################################################### # Debug Formatter # ####################################################################### @@ -238,6 +242,7 @@ def test_WGAN_GP_loss(N=5): i += 1 +@pytest.mark.skipif("tensorflow" not in sys.modules, reason="tensorflow not installed") def test_NCELoss(N=1): from numpy_ml.neural_nets.losses import NCELoss from numpy_ml.utils.data_structures import DiscreteSampler @@ -2323,6 +2328,7 @@ def fit_VAE(): BV.fit(X_train, n_epochs=1, verbose=False) +@pytest.mark.skipif("tensorflow" not in sys.modules, reason="tensorflow not installed") def test_WGAN_GP(N=1): from numpy_ml.neural_nets.models.wgan_gp import WGAN_GP