Polynomial fit

1 Fitting a noisy polynomial curve

We now consider the task of fitting a polynomial curve with noise. Despite dealing with higher-order polynomials than before, this problem can also be rewritten as a linear regression task. Let us first generate a dataset with the help of the function noisy_curve, which maps \(x\) to a polynomial of degree \(d\), \(f(x)=\mathbf{w}^T\mathbf{x}+\text{noise}\), where \(\mathbf{x}=(x^0,x^1,\ldots,x^d)\).

coeffs = [2., 1., 0., 1.]
x, y = noisy_curve(coeffs, interval=[-3., 1.5], noise=[0., 2.])

Figure 1 shows the generated data with the ground truth.
Code
fig = go.Figure()
fig.add_scatter(x=x, y=y, mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'+'<br>y:%{y:.2f}</br><extra></extra>')
x1 = np.linspace(x.min(), x.max(), num=50)
x1, y1 = noisy_curve(coeffs, x=x1)
fig.add_scatter(x=x1, y=y1, mode="lines", name='Ground Truth')
fig.update_layout(width=800, height=400, xaxis_title='x', yaxis_title='f(x)')
fig.show()
As in the previous section, we choose the mean square error for the loss function.
2 Polynomial fit as multivariable linear regression
Polynomial regression can be seen as a linear regression of multiple variables with the help of the following trick. Let us rewrite \(f(x)=\sum_{i=0}^d w_i x^i\) as \(f(x)=\mathbf{w}^T\mathbf{x}\), where \(\mathbf{x}=(x^0, x^1, \ldots, x^d)\). We can then see the polynomial fit as a linear regression of the function \(y=\sum_{i=0}^d w_i x_i\), where \(x_i=x^i\).
The mean square error function therefore reads
\[\begin{aligned} MSE &= \sum_{i=1}^N (\mathbf{w}^T\mathbf{x}_i-y_i)^2\\ &= \parallel \mathbf{y}-X\mathbf{w}\parallel^2\\ &=(\mathbf{y}-X\mathbf{w})^T(\mathbf{y}-X\mathbf{w}), \end{aligned}\]
where
\[ X= \begin{pmatrix} 1 & x_1^1 & \ldots & x_1^d\\ 1 & x_2^1 & \ldots & x_2^d \\ \vdots& \vdots & \vdots & \vdots\\ 1 & x_N^1 & \ldots & x_N^d \end{pmatrix} .\]
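For concreteness, this matrix can be assembled in one call with numpy's np.vander. The snippet below is a minimal sketch (x_pts is a small stand-in array, not the dataset generated above); the notebook builds the same matrix by hand in exact_poly_fit further down.

import numpy as np

x_pts = np.array([0.5, 1.0, 2.0])
# Columns are x^0, x^1, ..., x^d for each data point, here with d = 3
X = np.vander(x_pts, N=4, increasing=True)
# X[i] = [1, x_i, x_i**2, x_i**3]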
We now take the derivative with respect to the weights \(\mathbf{w}\) and set it to \(0\). The gradient reads \(2X^T(X\mathbf{w}-\mathbf{y})\), so the condition \(X^TX\mathbf{w}=X^T\mathbf{y}\) yields the estimator
\[\mathbf{w} =(X^TX)^{-1}X^T\mathbf{y}.\]
Implement a function exact_poly_fit(x, y, degree) that numerically computes the weights w with the expression above. You can use np.linalg.inv to do the matrix inversion.
Code
def exact_poly_fit(x, y, degree):
    mat = np.ones((x.size, degree+1))
    for i in range(degree):
        mat[:, i+1] = x**(i+1)
    w = np.linalg.inv(np.transpose(mat)@mat)@np.transpose(mat)@y
    return w
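Inverting \(X^TX\) explicitly can become numerically fragile for high degrees. As a hedge, here is a sketch of an alternative (not used in the rest of this notebook) that solves the same least-squares problem with np.linalg.lstsq:

def exact_poly_fit_lstsq(x, y, degree):
    # Same design matrix as in exact_poly_fit, solved by least squares instead of an explicit inverse
    mat = np.vander(x, N=degree + 1, increasing=True)
    w, *_ = np.linalg.lstsq(mat, y, rcond=None)
    return w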
Let us run the algorithm for our dataset. We will fit a 3rd degree polynomial and compare the resulting parameters to the original ones.
w_best = exact_poly_fit(x, y, 3)
p1, p2 = dict(coeffs=w_best), dict(coeffs=coeffs)
print(np.array([w_best, coeffs]))
print(f'Best parameters: {MSE(x,y,curve,params=p1):.3f}')
print(f'Original parameters: {MSE(x,y,curve,params=p2):.3f}')
[[2.04366107 1.05858036 0.21556575 1.0784744 ]
[2. 1. 0. 1. ]]
Best parameters: 3.767
Original parameters: 3.825
The algorithm does a fairly good job. It is quite interesting to have a look at the mean square error on this dataset. The best parameters have a lower loss than the actual true parameters! We will come back to this point later ;)
3 Stochastic Gradient Descent
Just like we did with the linear regression, we can optimize our model parameters with a gradient-based method. Let us see what we obtain with the stochastic gradient descent algorithm for the polynomial fit. Figure 2 shows the fit after optimization.
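The helpers sgd, grad_MSE_pr, MSE and curve used below come from the previous section. As a reminder of what a gradient step does for this model, here is a minimal full-batch sketch based on the gradient \(2X^T(X\mathbf{w}-\mathbf{y})\) of the loss above (an illustration only, not the actual sgd implementation):

def gd_poly(x, y, w0, eta=1e-5, niter=10000):
    # Full-batch gradient descent on sum_i (w^T x_i - y_i)^2
    X = np.vander(x, N=w0.size, increasing=True)
    w = w0.copy()
    for _ in range(niter):
        w = w - eta * 2 * X.T @ (X @ w - y)
    return w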
coeffs0 = np.random.normal(loc=0, scale=0.1, size=4)
ll = dict(loss=MSE, grads=grad_MSE_pr, fun=curve)
pini = dict(coeffs=coeffs0)
df = pd.DataFrame(columns=['coeffs','value'])
trackers = sgd(x, y, pini, ll, eta=1E-5, niter=int(1E4))
df1 = pd.DataFrame(data={'coeffs': trackers['coeffs'], 'value': trackers['loss']})
df = pd.concat([df, df1])

print(f'final Loss:{df["value"].iloc[-1]:3f}')
print(df["coeffs"].iloc[-1])
print(coeffs)
final Loss:3.981939
[1.29711172 0.64731396 0.65844775 1.27037435]
[2, 1.0, 0, 1]
Code
cc = df["coeffs"].iloc[-1]

fig = go.Figure()
fig.add_scatter(x=x, y=y, mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'+'<br>y:%{y:.2f}</br><extra></extra>')
x1 = np.linspace(x.min(), x.max(), num=50)
y1 = curve(x1, cc)
fig.add_scatter(x=x1, y=y1, mode="lines", name='Fit')
fig.update_layout(width=800, height=400, xaxis_title='x', yaxis_title='f(x)')
fig.show()
Figure 3 shows how the algorithm adjusts the polynomial curve during the optimization.
Code
step = 100
x1 = np.linspace(x.min(), x.max(), num=50)

frames = [go.Frame(data=[go.Scatter(x=x1, y=curve(x1, df["coeffs"].iloc[i*step]), mode='lines')],
                   layout=go.Layout(title_text=f'step:{i*step}, MSE:{df["value"].iloc[i*step]:.2f}'))
          for i in range(len(df)//step)]

buttons = [dict(label="Play", method="animate",
                args=[None, {"frame": {"duration": 100, "redraw": True},
                             "fromcurrent": True,
                             "transition": {"duration": 300, "easing": "quadratic-in-out"}}]),
           dict(label="Pause", method="animate",
                args=[[None], {"frame": {"duration": 0, "redraw": False}, "mode": "immediate",
                               "transition": {"duration": 0}}]),
           dict(label="Restart", method="animate",
                args=[None, {"frame": {"duration": 100, "redraw": True}}])]

Fig = go.Figure(
    data=[go.Scatter(x=x1, y=curve(x1, df["coeffs"].iloc[0]), mode='lines', name='line',
                     hovertemplate='x:%{x:.2f}'+'<br>y:%{y:.2f}</br><extra></extra>'),
          go.Scatter(x=x, y=y, mode="markers", name='data',
                     hovertemplate='x:%{x:.2f}'+'<br>y:%{y:.2f}</br><extra></extra>')],
    layout=go.Layout(
        xaxis=dict(range=[x.min()-2, x.max()+2], autorange=False),
        yaxis=dict(range=[y.min()-2, y.max()+2], autorange=False),
        updatemenus=[dict(type="buttons", buttons=buttons)]
    ),
    frames=frames
)
Fig.update_layout(xaxis_title='x', yaxis_title='f(x)')
Fig.show()
4 Overfitting
Up until now, we have not discussed a very important hyper-parameter of this problem: the degree of the polynomial. In particular, we have fixed the degree to be the one of the original function. However, this is typically unknown in practice, and we either rely on educated guesses or resort to performing various fits for different degrees and keeping the best one (but what is the best one?! We'll see). To this end, we use the polyfit subroutine of numpy, which is much more stable than the exact_poly_fit we prepared.
vec_cc = []
mse_t = []
mse_v = []

npoly = 20
ndata = 50
for i in np.arange(1, npoly):
    vec_cc.append(polyfit(x[:ndata], y[:ndata], deg=i))
    p1, p2 = dict(coeffs=vec_cc[i-1]), dict(coeffs=vec_cc[i-1])
    mse_t.append(MSE(x[:ndata], y[:ndata], curve, params=p1))
    mse_v.append(MSE(x[ndata:], y[ndata:], curve, params=p2))
Figure 4 shows the loss function over the training data as a function of the polynomial degree. At first glance, it looks like a higher degree gives rise to a lower loss. However, in Figure 5 we can really see the overfitting of the higher-order polynomials. This can be detected by dividing the available data into two subsets: the training set and the validation set. We train the algorithm using data exclusively from the training set and, then, we evaluate its performance on the validation set. The validation set contains new data for our model, which allows us to assess how well our algorithm generalizes to unseen data. We are overfitting to the training set when the loss on the validation set stagnates or increases. Turn on the orange line in Figure 4 to see it!
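The split itself is what the loop above does with the ndata index. For clarity, here is a self-contained sketch of the same idea using numpy's polynomial module (split_losses is a hypothetical helper, assuming the increasing-order coefficient convention used throughout this notebook):

from numpy.polynomial import polynomial as P

def split_losses(x, y, degree, ndata):
    # Fit on the first ndata points, report the mean squared error on both slices
    c = P.polyfit(x[:ndata], y[:ndata], deg=degree)
    train_loss = np.mean((P.polyval(x[:ndata], c) - y[:ndata])**2)
    val_loss = np.mean((P.polyval(x[ndata:], c) - y[ndata:])**2)
    return train_loss, val_loss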
Code
fig = go.Figure()
fig.add_scatter(x=np.arange(1, npoly), y=mse_t, mode='lines+markers', name='training')
fig.add_scatter(x=np.arange(1, npoly), y=mse_v, mode='lines+markers', visible='legendonly', name='validation')
fig.update_layout(yaxis_range=[0, 10], xaxis_title='degree', yaxis_title='Loss')
Code
fig = go.Figure()
fig.add_scatter(x=x[:ndata], y=y[:ndata], mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'+'<br>y:%{y:.2f}</br><extra></extra>')
x1 = np.linspace(x.min(), x.max(), num=50)

poly = [1, 2, 3, 4, 6, 8, 10, 19]
for i, k in enumerate(poly):
    visible = True if k == 0 else 'legendonly'
    x1, y1 = noisy_curve(vec_cc[k-1], x=x1)
    fig.add_scatter(x=x1, y=y1, mode="lines", name=f'{k}th degree', visible=visible)
fig.update_layout(width=800, height=400, yaxis_range=[y.min(), y.max()], xaxis_title='x', yaxis_title='f(x)')
fig.show()
A typical instance of overfitting appears when we have many free parameters compared to the number of data points. Indeed, with some algorithms we can achieve a zero training loss when we have as many parameters as data points. However, this usually comes at the expense of extreme overfitting and poor generalization.
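As a quick illustration of that limit (a hypothetical mini-experiment, not part of the dataset above), a degree-\((N-1)\) polynomial passes exactly through \(N\) generic points, so the training loss vanishes even when there is nothing but noise to learn:

rng = np.random.default_rng(0)
xs = np.sort(rng.uniform(-3., 1.5, size=10))
ys = rng.normal(size=10)                        # pure noise
c = np.polynomial.polynomial.polyfit(xs, ys, deg=9)
residual = np.polynomial.polynomial.polyval(xs, c) - ys
print(np.max(np.abs(residual)))                 # essentially zero: the fit interpolates the noise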
In the experiment above, we have used 50 data points to fit up to a 19th-degree polynomial. Run the same procedure with less and less data, e.g., set ndata to 30, 20 and 10, and observe what happens with the resulting curves. Do we see overfitting for lower-degree polynomials? Do you think these models would provide a reasonable prediction if we drew a new data sample from the same experiment?
We will now discuss two strategies to prevent overfitting.
5 More data
As we have seen right above, the number of model parameters relative to the amount of available data is a key factor for overfitting. If having less data makes our model more prone to overfitting, having more data naturally helps us mitigate it.
Therefore, let us generate more samples.
nsamples = int(1E3)
xn, yn = noisy_curve(coeffs, interval=[-3, 1.5], noise=[0., 2], nsamples=nsamples)
We then perform the split between the training set and validation set and compute the loss function for both sets.
vec_cc, mse_t, mse_v = [], [], []

npoly = 20
ndata = int(0.8*nsamples)  # We set 80% of the data for the training and 20% for the validation

for i in np.arange(1, npoly):
    vec_cc.append(polyfit(xn[:ndata], yn[:ndata], deg=i))
    p1, p2 = dict(coeffs=vec_cc[i-1]), dict(coeffs=vec_cc[i-1])
    mse_t.append(MSE(xn[:ndata], yn[:ndata], curve, params=p1))
    mse_v.append(MSE(xn[ndata:], yn[ndata:], curve, params=p2))
Figure 6 shows the comparison between the training loss and the validation loss for different degrees. We observe a much better behavior than in the previous case with the small dataset. This is also confirmed in Figure 7, where we can clearly see the advantage of using a larger dataset.
Code
fig = go.Figure()
fig.add_scatter(x=np.arange(1, npoly), y=mse_t, mode='lines+markers', name='training')
fig.add_scatter(x=np.arange(1, npoly), y=mse_v, mode='lines+markers', visible='legendonly', name='validation')
fig.update_layout(yaxis_range=[0, 10], xaxis_title='degree', yaxis_title='Loss')
Code
fig = go.Figure()
fig.add_scatter(x=xn[:ndata], y=yn[:ndata], mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'+'<br>y:%{y:.2f}</br><extra></extra>')
x1 = np.linspace(x.min(), x.max(), num=50)

poly = [1, 2, 3, 4, 6, 8, 10, 19]
for i, k in enumerate(poly):
    visible = True if k == 0 else 'legendonly'
    x1, y1 = noisy_curve(vec_cc[k-1], x=x1)
    fig.add_scatter(x=x1, y=y1, mode="lines", name=f'{k}th degree', visible=visible)
fig.update_layout(width=800, height=400, yaxis_range=[y.min(), y.max()], xaxis_title='x', yaxis_title='f(x)')
fig.show()
6 Regularization
Another way to avoid overfitting is regularization. The idea here is to add a term to the loss function that prevents the weights from behaving in an ill-defined way. An example of such regularization is \(l_2\) regularization, which consists in adding the term \(\alpha\parallel \mathbf{w} \parallel^2\) to the loss function. Intuitively, this parabolic term prevents the weights from exploding. The regression with such a regularization is called Ridge regression.
For the analytical solution, show that the regularization term gives rise to the following solution
\[w =(X^TX+\alpha \mathbb{1})^{-1}X^T\mathbf{y}.\]
For the gradient descent algorithm, show that the regularization leads to the update rule for each weight \(w\)
\[ w_{i+1} = (1-2\, \alpha\,\eta) w_i - \eta \partial_w L \]
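Numerically, the analytical Ridge solution only differs from the earlier closed form by the \(\alpha\mathbb{1}\) term. A minimal sketch (assuming the same design-matrix convention as exact_poly_fit; note that this version also penalizes the intercept):

def exact_ridge_fit(x, y, degree, alpha):
    X = np.vander(x, N=degree + 1, increasing=True)
    # (X^T X + alpha * 1)^{-1} X^T y, computed with a linear solve instead of an explicit inverse
    return np.linalg.solve(X.T @ X + alpha * np.eye(degree + 1), X.T @ y)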
Here we use the implementation from scikit-learn. To this end, we define a function that creates the matrix \(X\).
def poly_cond(x, n):
    matx = np.zeros((x.size, n))
    for i, k in enumerate(range(1, n+1)):
        matx[:, i] = x**k
    return matx
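As a quick check (example values only), poly_cond leaves out the column of ones because Ridge fits the intercept separately:

print(poly_cond(np.array([1., 2., 3.]), 3))
# [[ 1.  1.  1.]
#  [ 2.  4.  8.]
#  [ 3.  9. 27.]]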
We then perform the Ridge regression for the polynomials of different degrees.
alpha = 0.5

vec_cc, mse_t, mse_v = [], [], []

npoly = 20
ndata = 50
for i in np.arange(1, npoly):
    matx = poly_cond(x[:ndata], i)
    reg = linear_model.Ridge(alpha=alpha)
    reg.fit(matx, y[:ndata])
    c = np.insert(reg.coef_, 0, reg.intercept_)
    vec_cc.append(c)
    p1, p2 = dict(coeffs=vec_cc[i-1]), dict(coeffs=vec_cc[i-1])
    mse_t.append(MSE(xn[:ndata], yn[:ndata], curve, params=p1))
    mse_v.append(MSE(xn[ndata:], yn[ndata:], curve, params=p2))
Figure 8 shows the loss function in terms of the degree of the polynomial. We can now see that the validation curve behaves much better for higher-degree polynomials. This is also confirmed in Figure 9, which shows much better-behaved curves for the higher-degree polynomials.
Code
fig = go.Figure()
fig.add_scatter(x=np.arange(1, npoly), y=mse_t, mode='lines+markers', name='training')
fig.add_scatter(x=np.arange(1, npoly), y=mse_v, mode='lines+markers', visible='legendonly', name='validation')
fig.update_layout(yaxis_range=[0, 10], xaxis_title='degree', yaxis_title='Loss')
Code
fig = go.Figure()
fig.add_scatter(x=x[:ndata], y=y[:ndata], mode="markers", name='data',
                hovertemplate='x:%{x:.2f}'+'<br>y:%{y:.2f}</br><extra></extra>')
x1 = np.linspace(x.min(), x.max(), num=50)

poly = [1, 2, 3, 4, 6, 8, 10, 19]
for i, k in enumerate(poly):
    visible = True if k == 0 else 'legendonly'
    x1, y1 = noisy_curve(vec_cc[k-1], x=x1)
    fig.add_scatter(x=x1, y=y1, mode="lines", name=f'{k}th degree', visible=visible)
fig.update_layout(width=800, height=400, yaxis_range=[y.min(), y.max()])
fig.show()
Change the value of \(\alpha\) and perform the same analysis. What do you see?
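One possible way to organize that scan (a sketch only, evaluating training and validation on the small dataset x, y for simplicity; for each \(\alpha\), it reports the best validation loss over the degrees):

for alpha in [1e-3, 1e-1, 0.5, 5., 50.]:
    mse_v_alpha = []
    for i in np.arange(1, npoly):
        reg = linear_model.Ridge(alpha=alpha)
        reg.fit(poly_cond(x[:ndata], i), y[:ndata])
        c = np.insert(reg.coef_, 0, reg.intercept_)
        mse_v_alpha.append(MSE(x[ndata:], y[ndata:], curve, params=dict(coeffs=c)))
    print(alpha, min(mse_v_alpha))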