TD 1: Single Layer Perceptron#

Goals:

  • check that everyone can run a notebook with appropriate config

  • play around with the single-layer perceptron

Hédi Hadiji, with bits from Odalric-Ambrym Maillard.

December 2023

# Imports
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from copy import deepcopy

import time

import matplotlib.pyplot as plt

If necessary, install PyTorch by running

pip3 install torch

(in a virtual environment)

print(f"python --version = {sys.version}")
print(f"torch.__version__ = {torch.__version__}")
print(f"np.__version__ = {np.__version__}")
python --version = 3.9.18 (main, Sep 11 2023, 08:25:10) 
[Clang 14.0.6 ]
torch.__version__ = 2.2.1
np.__version__ = 1.26.4

Torch 101#

“The torch package contains data structures for multi-dimensional tensors and defines mathematical operations over these tensors. Additionally, it provides many utilities for efficient serializing of Tensors and arbitrary types, and other useful utilities. […] provides classes and functions implementing automatic differentiation of arbitrary scalar valued functions.” PyTorch
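As a minimal illustration of the last point (a small sketch, independent of the cells below): torch can differentiate a scalar-valued function of a tensor automatically.

x = torch.tensor(3.0, requires_grad=True)
f = x ** 2 + 2 * x      # a scalar-valued function of x
f.backward()            # automatic differentiation of f with respect to x
print(x.grad)           # tensor(8.), since df/dx = 2x + 2 = 8 at x = 3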

Variable types#

# Very similar syntax to numpy.
zero_torch = torch.zeros((3, 2))

print('zero_torch is of type {:s}'.format(str(type(zero_torch))))

# Torch -> Numpy: simply call the numpy() method.
zero_np = np.zeros((3, 2))
assert (zero_torch.numpy() == zero_np).all()

# Numpy -> Torch: simply call the corresponding function on the np.array.
zero_torch_float = torch.FloatTensor(zero_np)
print('\nFloat:\n', zero_torch_float)
zero_torch_int = torch.LongTensor(zero_np)
print('Int:\n', zero_torch_int)
zero_torch_bool = torch.BoolTensor(zero_np)
print('Bool:\n', zero_torch_bool)

# Reshape
print('\nView new shape...', zero_torch.view(1, 6))
# Note that print(zero_torch.reshape(1, 6)) would work too.
# The difference is in how memory is handled (view imposes contiguity).

# Algebra
a = torch.randn((3, 2))
b = torch.randn((3, 2))
print('\nAlgebraic operations are overloaded:\n', a, '\n+\n', b, '\n=\n', a+b )

# More generally, torch shares the syntax of many attributes and functions with Numpy.
zero_torch is of type <class 'torch.Tensor'>

Float:
 tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
Int:
 tensor([[0, 0],
        [0, 0],
        [0, 0]])
Bool:
 tensor([[False, False],
        [False, False],
        [False, False]])

View new shape... tensor([[0., 0., 0., 0., 0., 0.]])

Algebraic operations are overloaded:
 tensor([[ 1.0182, -0.6023],
        [-0.4773, -1.3301],
        [-2.4343,  0.6200]]) 
+
 tensor([[ 0.6202,  0.7213],
        [ 1.7796, -0.3127],
        [-0.1968,  0.1540]]) 
=
 tensor([[ 1.6383,  0.1190],
        [ 1.3023, -1.6428],
        [-2.6311,  0.7741]])
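To make the view/reshape remark concrete, here is a quick check (a sketch added for illustration, not part of the original cell): view requires the underlying memory to be contiguous, while reshape falls back to a copy when it is not.

t = torch.arange(6).view(2, 3)
t_transposed = t.t()                    # the transpose is a non-contiguous view
print(t_transposed.is_contiguous())     # False
print(t_transposed.reshape(6))          # works: reshape copies if necessary
try:
    t_transposed.view(6)                # view cannot handle non-contiguous data
except RuntimeError as err:
    print('view failed:', err)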

Gradient management#

# torch.Tensor is a data structure similar to np.array, but more complex.
# It is basically a static array of numbers, but it may also carry an overlay that
# handles automatic differentiation (i.e. keeping track of the gradient and of which
# tensors depend on which).
# To access the static array embedded in a tensor, simply call the detach() method.
print(zero_torch.detach())

# Inside a computation that requires automatic differentiation (basically when training
# a neural network), never use detach(): the gradient information would be lost,
# effectively freezing the variable and preventing backpropagation through it.
# However, when returning the result of training, do use detach() to save memory
# (the bare tensor data uses much less memory than the full tensor with gradient
# management, and is much less prone to mistakes such as bad copies and memory leaks).

# We will solve theta * x = y for theta, with x = 1 and y = 2
x = torch.ones(1)
y = 2 * torch.ones(1)

# Actually by default torch does not add the gradient management overlay
# when declaring tensors like this. To force it, add requires_grad=True.
theta = torch.randn(1, requires_grad=True)

# Optimisation routine
# (Adam is a sophisticated variant of SGD, with adaptive step).
optimizer = optim.Adam(params=[theta], lr=.1)

# Initial value of the parameter
print('Initial guess:', theta.detach())

for _ in range(100):
    # By default, torch accumulates gradients in memory.
    # To obtain the desired gradient descent behaviour,
    # just clean the cached gradients using the following line:
    optimizer.zero_grad()
    
    # Quadratic loss (* and ** are overloaded so that torch
    # knows how to differentiate them)
    loss = (y - theta * x) ** 2
    
    # Apply the chain rule to automatically compute gradients
    # for all relevant tensors.
    loss.backward()
    
    # Run one step of optimisation routine.
    optimizer.step()
    
print('Final estimate:', theta.detach())
print('The final estimate should be close to', y)
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
Initial guess: tensor([2.2107])
Final estimate: tensor([1.9999])
The final estimate should be close to tensor([2.])
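As a side note on the zero_grad() call above (an extra sketch, not part of the TD): gradients accumulate in the .grad attribute until they are explicitly cleared, which is exactly what optimizer.zero_grad() does for its parameters.

w = torch.ones(1, requires_grad=True)
for _ in range(3):
    loss = (2 * w).sum()
    loss.backward()
    print(w.grad)        # tensor([2.]), tensor([4.]), tensor([6.]): gradients add up
w.grad.zero_()           # manual reset, analogous to optimizer.zero_grad()
print(w.grad)            # tensor([0.])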

Setting up a Perceptron experiment#

1 - Data#

class ToyLinearData(): 
    """
        Toy dataset generating linearly separable data with margin gamma. 
    """
    def __init__(self, n, d, gamma=0, w=None):
        self.n = n 
        self.d = d
        self.gamma = gamma
        if w is None:
            w = torch.normal(torch.zeros(d))
        self.w = w / torch.linalg.norm(w)
       
        self.features = torch.zeros((n, d))
        self.labels = torch.zeros((n))
        self.max_norm = 0
        self.w_margin = 1e10

        self.fill_data()

    def fill_data(self):
        for i in range(self.n):
            x, y = self.new_example()
            self.features[i, :] =  x
            self.labels[i] = y
            self.max_norm = max(torch.norm(x), self.max_norm)
            self.w_margin = min(abs(torch.dot(self.w, x)), self.w_margin)

    def new_example(self):
        """
            Generates a Gaussian vector with zero mean and covariance I/d (its norm is
            typically around 1), then shifts it by +/- gamma * w depending on its side of
            the hyperplane, which guarantees a margin of at least gamma (since ||w|| = 1).
        """
        # Gaussian vectors typically have norm around sqrt(d), so we normalise
        x = torch.normal(torch.zeros(self.d)) / np.sqrt(self.d) 
        value = torch.dot(x, self.w)
        if value >= 0: 
            x = x + self.gamma * self.w 
            y = 1
        elif value < 0:
            x = x - self.gamma * self.w 
            y = -1
        return x, y

    def __len__(self):
        return self.n


class ToySphericalData: 
    """
        Generate toy data that is not linearly separable
    """
    def __init__(self, n, d, max_rad=1, threshold_rad=.5):
        self.n = n 
        self.d = d
       
        self.features = torch.zeros((n, d))
        self.labels = torch.zeros((n))

        self.max_rad = max_rad
        self.threshold_rad = threshold_rad

        self.fill_data()

    def fill_data(self):
        for i in range(self.n): 
            r = np.random.rand() * self.max_rad
            x = torch.normal(torch.zeros(self.d))
            x = r / torch.norm(x) * x
            self.features[i, :] =  x
            if r > self.threshold_rad:
                self.labels[i] = 1
            else: 
                self.labels[i] = -1

    def __len__(self):
        return self.n

def plot_data(data):
    'Scatter plot for d = 2. Quite slow'
    for i in range(len(data)): 
        if data.labels[i] == 1:
            plt.scatter(data.features[i][0], data.features[i][1], color='blue')#, label='+1')
        else: 
            plt.scatter(data.features[i][0], data.features[i][1], color='red')#, label='-1')
n = 200
d = 2
gamma = 0
data = ToyLinearData(n, d, gamma=gamma)
print(f'Margin of w : {data.w_margin: .4f}')
print(f'Max norm : {data.max_norm: .4f}')
print(f'True w : {data.w}')

plot_data(data)

plt.axline((0, 0), slope=-data.w[0] / data.w[1], color='C0', alpha=.5, label='True Hyperplane')

plt.axis('equal')
Margin of w :  0.0003
Max norm :  2.3916
True w : tensor([ 0.9969, -0.0788])
(-2.208884280920029,
 2.5589091002941133,
 -2.0815599977970125,
 2.441225749254227)
_images/63620daa9e5965a5c89085a72cfe903b5679689fd4ec73d6a61e285746f05bc8.png
n = 500
d = 2
data = ToySphericalData(n, d)

plot_data(data)

plt.axis('equal')
(-1.0394978940486908,
 1.0771204650402069,
 -1.0675497114658357,
 1.0754001200199128)
_images/2f356b46f8b1b21e2d55c22ea819f327323474601cb6e84a929fcf52f62362c6.png

2 - Implementing the Perceptron#

class Perceptron(): 
    """
        Single-layer Perceptron
    """
    def __init__(self, dim): 
        self.dim = dim
        self.w = torch.zeros(dim)

        self.margin_mistake_counter = 0

    def predict(self, x): 
        if torch.dot(self.w, x) > 0: 
            return 1
        else: 
            return -1
    
    def update_margin_mistake(self, x, y): 
        self.w += y * x
        self.margin_mistake_counter += 1
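Quick sanity check of the class (illustration only): with w initialised to zero every point is a margin mistake, and one update moves w in the direction of y * x.

p = Perceptron(2)
x0, y0 = torch.tensor([1., 0.]), 1
print(p.predict(x0))        # -1, since w = 0 gives a zero dot product
p.update_margin_mistake(x0, y0)
print(p.w, p.predict(x0))   # tensor([1., 0.]) and the prediction is now +1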
def train_perceptron(alg, data, t_lim=100_000): 
    converged = False
    t = 0 
    while not converged and t < t_lim:
        shuffled_indices = np.arange(len(data))
        #np.random.shuffle(shuffled_indices)
        clean_pass = True
        for i in shuffled_indices: 
            x, y = data.features[i], data.labels[i]
            if torch.dot(alg.w, x) * y < 1: 
                alg.update_margin_mistake(x, y)
                clean_pass = False
            t += 1
        if clean_pass: 
            converged = True
    return converged
n = 100
d = 2
gamma = .5
data = ToyLinearData(n, d, gamma=gamma)
alg = Perceptron(d)

3 - Testing the Perceptron#

n = 100
d = 2
gamma = 0.1
data = ToyLinearData(n, d, gamma=gamma)
alg = Perceptron(d)

Run the code below a few times to see how the Perceptron behaves.#

You can restart and initialize the algorithm again by running the cell above.

plot_data(data)

i = np.random.randint(len(data))
x, y = data.features[i], data.labels[i]
plt.scatter(x[0], x[1], marker='x', s=100, label='Point selected')

plt.axline((0, 0), slope=-alg.w[0] / alg.w[1], color='violet', alpha=.5, label='Hyperplane Before')

if torch.dot(alg.w, x) * y < 1: 
    alg.update_margin_mistake(x, y)
    print('Point selected leads to a margin mistake')
else:
    print('Point selected is well classified with a margin: no update')

plt.axline((0, 0), slope=-alg.w[0] / alg.w[1], color='orange', alpha=.5, label='Hyperplane After')

plt.axline((0, 0), slope=-data.w[0] / data.w[1], color='green', alpha=.5, label='True Hyperplane')

print('Normalized Alg w :',  alg.w / torch.norm(alg.w))
print('True w: ', data.w)

plt.legend()
plt.axis('equal')
plt.show()
Point selected leads to a margin mistake
Normalized Alg w : tensor([0.2283, 0.9736])
True w:  tensor([-0.6492,  0.7607])
_images/718543874fe7fb950e57a51223e97bd5bbfd9547ceb46162b499c6d5b6e3de44.png

Non-separable data#

n, d = 100, 2
data = ToySphericalData(n, d)
plot_data(data)

i = np.random.randint(len(data))
x, y = data.features[i], data.labels[i]
plt.scatter(x[0], x[1], marker='x', s=100, label='Point selected')

plt.axline((0, 0), slope=-alg.w[0] / alg.w[1], color='violet', alpha=.5, label='Hyperplane Before')

if torch.dot(alg.w, x) * y < 1: 
    alg.update_margin_mistake(x, y)
    print('Point selected leads to a margin mistake')
else:
    print('Point selected is well classified with a margin: no update')

plt.axline((0, 0), slope=-alg.w[0] / alg.w[1], color='orange', alpha=.5, label='Hyperplane After')

print('Alg w :',  alg.w)
#print('True w: ', data.w)

plt.legend()
plt.axis('equal')
plt.show()
Point selected leads to a margin mistake
Alg w : tensor([-0.1714,  0.3209])
_images/742e23d8a2775b1ae577418d7ddd97eb344ff4f08da26e5bb6414779238a4284.png

Run the algorithm, also in higher dimension, to check convergence#

n, d, gamma = 100, 2, 0
data = ToyLinearData(n, d, gamma=gamma)

alg = Perceptron(d)
converged = train_perceptron(alg, data, t_lim=1_000_000)
print(f'Converged: {converged}')

print(alg.w / torch.norm(alg.w))
print(data.w)
Converged: True
tensor([0.6950, 0.7190])
tensor([0.6299, 0.7767])
n, d = 200, 10
data = ToySphericalData(n, d)

alg = Perceptron(d)
converged = train_perceptron(alg, data, t_lim=100_000)
print(f'Converged: {converged}')

print(alg.w / torch.norm(alg.w), torch.norm(alg.w))
Converged: False
tensor([-0.3983,  0.5476,  0.0862, -0.0950,  0.4477, -0.0551, -0.2822, -0.2792,
         0.2330, -0.3312]) tensor(4.4352)

2 - Optimization of the Perceptron#

ds = [100 * (i+1) for i in range(20)]
n_ds = len(ds)
N_mc = 20
all_mistakes = np.zeros((n_ds, N_mc))

n = 1000
gamma = .1
norms = np.zeros((n_ds, N_mc))
margins = np.zeros((n_ds, N_mc))

all_converged = True

for i, d in enumerate(ds):
    for j in range(N_mc):
        data = ToyLinearData(n, d, gamma=gamma)
        alg = Perceptron(d)
        converged = train_perceptron(alg, data, t_lim=100_000)
        if not (converged):
            print('Perceptron did not converge')
        all_converged = converged and all_converged
        all_mistakes[i, j] = alg.margin_mistake_counter
        norms[i, j] = data.max_norm
        margins[i, j] = data.w_margin
    
avg_mistakes = np.mean(all_mistakes, axis=1)
std_mistakes = np.std(all_mistakes, axis=1) / np.sqrt(N_mc)

fig, axs = plt.subplots(1, 3, figsize=(12, 4))

axs[0].set_title('Number of margin mistakes')
axs[0].plot(ds, avg_mistakes )
axs[0].fill_between(ds, (avg_mistakes - std_mistakes), (avg_mistakes + std_mistakes), alpha=0.4)
axs[0].set_ylim(0)

axs[1].set_title('Max norm of features')
axs[1].plot(ds, np.mean(norms, axis=1))

axs[2].set_title('Margin of samples')
mean_margins = np.mean(margins, axis=1)
axs[2].plot(ds, mean_margins)

axs[1].set_ylim(0)
axs[2].set_ylim(0, max(1, max(mean_margins)))
plt.show()
_images/72072df9b800cce4fd87d193bd914ac4ebcdb43aed1325cf236e308e74aba995.png
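For reference (an extra sketch, not part of the original TD): with features of norm at most R and margin gamma, the usual perceptron convergence argument, adapted to the margin-mistake update used here, bounds the number of margin mistakes by (R^2 + 2) / gamma^2. The averaged empirical quantities computed above can be compared with this bound.

mean_norms = np.mean(norms, axis=1)
mean_margins = np.mean(margins, axis=1)
bound = (mean_norms ** 2 + 2) / mean_margins ** 2   # rough bound, averaged over runs
plt.plot(ds, avg_mistakes, label='empirical margin mistakes')
plt.plot(ds, bound, label='(R^2 + 2) / gamma^2')
plt.legend()
plt.show()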

3 - Generalization#

n = 10
d = 10
gamma = .1

data = ToyLinearData(n, d, gamma=gamma)
alg = Perceptron(d)
converged = train_perceptron(alg, data, t_lim=100_000)
def test(alg, data, N_test): 
    '''
        Generates a fresh test set from the same distribution as data and returns
        1 - mean(y * prediction) over this test set, i.e. twice the misclassification rate.
    '''
    score = torch.zeros(N_test)
    for j in range(N_test): 
        x, y = data.new_example()
        score[j] = alg.predict(x) * y   # +1 if the prediction is correct, -1 otherwise
    return 1 - torch.mean(score)
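Since the quantity above is twice the misclassification rate, a direct error-rate variant could look like this (a sketch, not used in the cells below):

def error_rate(alg, data, N_test):
    mistakes = 0
    for _ in range(N_test):
        x, y = data.new_example()
        mistakes += int(alg.predict(x) != y)
    return mistakes / N_test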
gammas = [.01 * i for i in range(1,30)]
n_train = 10
d = 50

n_gammas = len(gammas)
N_test = 100
errors = torch.zeros(n_gammas)

for i, gamma in enumerate(gammas): 
    data = ToyLinearData(n_train, d, gamma=gamma)
    alg = Perceptron(d)
    converged = train_perceptron(alg, data, t_lim=1_000_000)
    if not (converged):
        print('Perceptron did not converge')
    errors[i] = test(alg, data, N_test)

plt.plot(np.array(gammas), errors)
plt.show()
_images/09c688d137dd8015e0b83053c4614a19419726b16e55322e255c8d85f6c74b82.png
n_trains = [10 * i for i in range(20)]
d = 100

gamma = 0.0

number_of_trainings = len(n_trains)
N_test = 500
errors = torch.zeros(number_of_trainings)

for i, n_train in enumerate(n_trains): 
    data = ToyLinearData(n_train, d, gamma=gamma)
    alg = Perceptron(d)
    converged = train_perceptron(alg, data, t_lim=1_000_000)
    if not (converged):
        print('Perceptron did not converge')
    
    errors[i] = test(alg, data, N_test)

plt.plot(n_trains, errors)
[<matplotlib.lines.Line2D at 0x1652f1c10>]
_images/eaf4b4622f2bfeacbccba9dafcc7d7f9a90cd201a7ba52705d31ebda1cd47dcb.png