# preliminary set up. ipympl allows for interactive plots; replace it with inline if not working
#%matplotlib ipympl
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
Edit 7/18/2020:
I wrote this when I was a bit new to the nitty-gritty of ML. If I had this to do all over again, I would simplify the following by just saying to myself: “It is the combination of cross-entropy (or log-likelihood) loss with the sigmoid activation function that gives you the nice property whereby the gradient becomes monotonic (for either class), i.e. the system is convex.” And that would be sufficient.
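For the record, here is a quick sketch of why that combination works out; this derivation is my later summary, not part of the original post. With \(h = \sigma(z)\) and \(z = a x + b\), the cross-entropy term for one example is \(\ell = -\left[ y \log h + (1-y)\log(1-h) \right]\), and
\[ \frac{\partial \ell}{\partial z} = h - y, \qquad \frac{\partial^2 \ell}{\partial z^2} = h(1-h) \ge 0, \]
so each term is convex in \(z\); since \(z\) is affine in \((a,b)\), each term (and hence the sum) is convex in the parameters. For the squared-error term \((y-h)^2\), the corresponding second derivative is \(2h(1-h)\left[ h(1-h) + (h-y)(1-2h) \right]\), which changes sign, and that is where the per-term non-convexity comes from.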
This is a post where I’m investigating convexity a bit, as it relates to neural networks.
Andrew Ng, in Week 3 of his Coursera course on Machine Learning, shows the following image with respect to the “cost function” to be optimized (as slide 14 of Lecture 6):
(image is giving problems, sorry)
I wanted to investigate this: Could I reproduce the two graphs he sketched? The two different loss functions are the mean squared error (MSE or sometimes just SE) and cross entropy (CE):
\[ MSE = {1\over m}\sum_{i=1}^m \left( y_i - h_i \right)^2 \]
\[ CE = - {1\over m}\sum_{i=1}^m \left[ y_i \log(h_i) + (1-y_i) \log(1-h_i) \right] \]
where \(y_i\) are the true values (0 or 1) and \(h_i = h(x_i)\) are the predictions.
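To make the comparison concrete (a quick numeric check of my own, not in the original post): a confident correct prediction, say \(y_i = 1,\ h_i = 0.9\), contributes \((1-0.9)^2 = 0.01\) to the MSE sum but \(-\log(0.9) \approx 0.105\) to the CE sum, while a confident wrong prediction \(y_i = 1,\ h_i = 0.01\) contributes \((1-0.01)^2 \approx 0.98\) versus \(-\log(0.01) \approx 4.6\). Cross entropy punishes confident mistakes far more heavily.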
TL/DR: No, I can’t reproduce his sketches. The graph I get for the sum of squared errors (SE) doesn’t have the wiggles that his drawing on the left does. (Perhaps he was just doodling an example of an arbitrary non-convex function, rather than the squared loss in particular?) Takeaways at the bottom of this, regarding the difference between a convex loss function (by itself) vs. a convex loss for a problem – i.e. the individual terms are convex for either function, but the sum of these terms is actually not strictly convex for either function (for this problem).
I read a few posts about this first…

* Math StackExchange: Show that logistic regression with squared loss function is non-convex, which includes a link to this nice demo on Desmos
* https://math.stackexchange.com/questions/2193478/loss-function-for-logistic-regression
* https://en.wikipedia.org/wiki/Loss_functions_for_classification, which seems to say that squared loss is convex. ??
…but then wanted to try for myself. As follows:
# basic functions for data operations
def h(x, a, b): # h = logistic function. a is 'weight' and b is 'bias'
    return 1/(1 + np.exp(-(a*x + b))) # For code below, a & b should be scalars, x can be anything

def classify_x(x, threshold):
    out = 0*x
    out[x > threshold] = 1.0
    return out
# define some data
num_x = 25
x_arr = np.linspace(-5, 5, num_x) # _arr denotes "array"
threshold = 0.7314 # threshold value chosen arbitrarily
y_arr = classify_x(x_arr, threshold)
# make a prediction
a_guess = 10/(x_arr[1] - x_arr[0])
b_guess = -a_guess * threshold
print("Prediction guess: a =", a_guess, ", b =", b_guess)
h_arr = np.array(h(x_arr, a_guess, b_guess))
# plot the data
fig = plt.figure()
plt.plot(x_arr, y_arr, 'o', color='red', label="Truth y")
plt.plot(x_arr, h_arr, 'x-', color='green', label="Prediction h")
plt.legend()
plt.show()
Prediction guess: a = 23.999999999999982 , b = -17.55359999999999
# define a couple of loss functions
def calc_se_loss(y_arr, h_arr): # squared error loss
    return np.mean( (y_arr - h_arr)**2 )

def calc_ce_loss(y_arr, h_arr): # cross-entropy loss, related to Kullback-Leibler divergence
    eps = 1.0e-16 # added inside both logs to avoid log(0) errors
    return -np.mean( y_arr*np.log(h_arr + eps) + (1 - y_arr)*np.log(1 - h_arr + eps) ) # elementwise multiplication
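As a quick sanity check (my addition, not in the original notebook): the initial guess from above should already fit the data well, so both losses should be small, while a deliberately bad guess should score much worse.

# sanity check: good guess vs. a deliberately bad guess (sigmoid centered at x = 0 instead of at the threshold)
h_good = h(x_arr, a_guess, b_guess)
h_bad  = h(x_arr, a_guess, 0.0)
print("good guess: SE =", calc_se_loss(y_arr, h_good), ", CE =", calc_ce_loss(y_arr, h_good))
print("bad guess:  SE =", calc_se_loss(y_arr, h_bad),  ", CE =", calc_ce_loss(y_arr, h_bad))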
# 1D plot
# define parameter space over which to plot
num_a = 100
a_arr = np.linspace(-a_guess, 4*a_guess, num_a)
b_arr = b_guess + 0*a_arr # make all the b's the same value for this first plot

se_loss = []
ce_loss = []
for i in range(a_arr.shape[0]): # cycle through all the values of a and b, getting a different loss for each
    h_arr = h(x_arr, a_arr[i], b_arr[i])
    se_loss.append( calc_se_loss(y_arr, h_arr) )
    ce_loss.append( calc_ce_loss(y_arr, h_arr) )
# plot 1-d version
fig = plt.figure()
ax = fig.gca()
ax.set_xlabel('a')
#ax.set_ylim(2.38, 2.41) # zoom in on flat part on the right
plt.plot(a_arr, np.log(se_loss), 'o-', color='red', label="SE Loss")
plt.plot(a_arr, np.log(ce_loss), 'o-', color='blue', label="CE Loss")
ax.set_ylabel("log of loss")
plt.legend()
plt.show()
EDIT (later): A reader pointed out that the use of the log function distorts the shape of the curves and thereby obscures the visual inspection of convexity. Oops. See, I was new to this!
In the above figure, it looks like the SE loss goes ‘flat’ for a bit on the left, but never turns upward until after the global minimum. For the CE loss, I can see a few places where we could connect two points with a straight line and not have all of the line lie within the epigraph. Still, the lack of flat regions for the blue line would make it preferable for gradient-based optimization.
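One way to take both the eyeballing and the log distortion out of the picture is a quick numerical midpoint check on the raw (un-logged) 1-D slices computed above. This check is my addition, not part of the original post; it only tests neighbouring triples of grid points (the grid is evenly spaced, so a_arr[i+1] is the midpoint of a_arr[i] and a_arr[i+2]), so it is a rough screen for convexity rather than a proof.

# midpoint convexity check along the 1-D slice (my addition, not in the original notebook)
# for a convex f: f((a1+a2)/2) <= (f(a1)+f(a2))/2 for every pair of points a1, a2
def count_convexity_violations(loss_vals, tol=1e-12):
    loss_vals = np.asarray(loss_vals)
    midpoints = loss_vals[1:-1]                        # f at the midpoint of each neighbouring pair
    chords    = 0.5*(loss_vals[:-2] + loss_vals[2:])   # average of f at the two endpoints
    return int(np.sum(midpoints > chords + tol))

print("SE midpoint-convexity violations along this slice:", count_convexity_violations(se_loss))
print("CE midpoint-convexity violations along this slice:", count_convexity_violations(ce_loss))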
Plot the full error surface
num_a = 100
# try experimenting: play around with the max & min of the a & b values to see the surface
a_arr = np.linspace(-a_guess/2, a_guess*2, num_a)
b_arr = np.linspace(b_guess*2, -b_guess/2, num_a)
A, B = np.meshgrid(a_arr, b_arr)

def plot_loss_surf(A, B, x_arr, loss='SE'):
    Z = np.zeros([len(A), len(B)])
    if ('SE' == loss):
        for i in range(len(A)):
            for j in range(len(B)):
                h_arr = h(x_arr, a_arr[i], b_arr[j])
                Z[j, i] = calc_se_loss(y_arr, h_arr)
    else:
        for i in range(len(A)):
            for j in range(len(B)):
                h_arr = h(x_arr, a_arr[i], b_arr[j])
                Z[j, i] = calc_ce_loss(y_arr, h_arr)

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d') # fig.gca(projection='3d') no longer works in recent matplotlib
    surf = ax.plot_surface(A, B, np.log(Z), cmap=cm.coolwarm,
                           linewidth=0, antialiased=False)
    ax.set_xlabel('a')
    ax.set_ylabel('b')
    ax.set_zlabel('log('+loss+')')
    #ax.set_zlim(2, 2.5)
    ax.set_title(loss+' loss')
    #fig.colorbar(surf, shrink=0.5, aspect=5) # Add a color bar which maps values to colors.
    plt.show()

plot_loss_surf(A, B, x_arr, loss='SE')
plot_loss_surf(A, B, x_arr, loss='CE')
The Verdict
So, although the individual terms \((y_i - h_i)^2\) and \(-\left[y_i\log(h_i)+(1-y_i)\log(1-h_i)\right]\) are individually convex, the sum of these terms, for either type of loss, is actually non-convex for this problem. Neither gives rise to unwanted local minima for this problem, though.
The SE loss, while at least not having any non-global minima, still has multiple significant flat regions that would prove tedious for gradient-descent optimization, whereas the CE loss is smoother and strictly monotonic on either side of the global minimum. The CE loss (and/or KL divergence) would therefore be preferable for this problem; you could do it with the SE loss assuming you had momentum or some fancy optimization algorithm, but it would take longer, and why bother?
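To make that last claim testable rather than rhetorical, here is a rough sketch, not in the original post, of how one might race plain gradient descent on the two losses for this problem. It uses the analytic gradients of each loss with respect to \(a\) and \(b\); the starting point, learning rate, and iteration count are arbitrary choices of mine.

# plain gradient descent on each loss from the same (deliberately modest) starting point;
# my own sketch, not part of the original notebook
def grad_descent(loss_name, a0=1.0, b0=0.0, lr=0.5, n_iter=2000):
    a, b = a0, b0
    for _ in range(n_iter):
        h_arr = h(x_arr, a, b)
        if loss_name == 'SE':
            common = 2*(h_arr - y_arr)*h_arr*(1 - h_arr)  # d(SE term)/d(a*x+b), elementwise
        else:
            common = h_arr - y_arr                        # d(CE term)/d(a*x+b), elementwise
        a -= lr*np.mean(common * x_arr)                   # gradient w.r.t. a
        b -= lr*np.mean(common)                           # gradient w.r.t. b
    return a, b

for name in ['SE', 'CE']:
    a_gd, b_gd = grad_descent(name)
    print(name, "loss after gradient descent: a =", a_gd, ", b =", b_gd,
          ", implied threshold =", -b_gd/a_gd)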
Aside: Solve the logistic regression problem using scikit-learn
from sklearn import linear_model

clf = linear_model.LogisticRegression(C=1e5, solver='lbfgs')
clf.fit(x_arr[:, np.newaxis], y_arr)
a_fit, b_fit = clf.coef_[0][0], clf.intercept_[0]
print("Fit parameters: a =", a_fit, ", b =", b_fit)
print("Predicted threshold: x = ", -b_fit/a_fit)
Fit parameters: a = 32.449347939033615 , b = -20.23838504197379
Predicted threshold: x = 0.6236915786412106
Afterword: “But you still haven’t found the global minimum!”
In the surface plots above, we see the minimum of the surface going lower and lower – even lower than the supposedly ‘optimum’ parameters we just found via scikit-learn. The reason is that there is no optimal parameter combination: the steepness parameter \(a\) of the sigmoid function \(h(x)\) is bounded from below by the data in this problem, but not from above. The data will constrain the center of the sigmoid \(x_0 = -b/a\) to some extent (i.e. it needs to lie between two values of \(x_i\)), but other than that…
So how, then, does the loss seem to get lower and lower? The steeper the sigmoid function, the more closely its values approach 0 and 1 on either side. Thus for this problem, the optimal solution is \(a \rightarrow \infty\), with \(b = -a\cdot({\rm threshold})\) so that the center stays at the threshold.
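A quick way to see this numerically (my addition, not part of the original post) is to hold the sigmoid’s center at the threshold by setting \(b = -a\cdot({\rm threshold})\) and let \(a\) grow:

# the loss keeps shrinking as the sigmoid steepens, as long as its center stays at the threshold
# (my addition, not in the original notebook)
for a_big in [10, 100, 1000, 10000]:
    h_arr = h(x_arr, a_big, -a_big*threshold)
    print("a =", a_big, ": SE =", calc_se_loss(y_arr, h_arr), ", CE =", calc_ce_loss(y_arr, h_arr))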