PyTorch Tutorial 5

The reason to use a neural network is that the inner kernel of logistic regression is still linear. To break this linearity, the network can apply an activation function between layers, for instance ReLU.

In this case, we use ReLU as the activation function to classify the images, and the resulting accuracy is noticeably better than plain logistic regression.
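To see why simply stacking linear layers would not help, note that two nn.Linear layers with no activation in between collapse into a single linear map, while a ReLU in between breaks that. A minimal sketch of this idea (the layer sizes are arbitrary and chosen only for illustration):

import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.randn(4, 784)      # a fake batch of flattened 28x28 images
lin1 = nn.Linear(784, 32)
lin2 = nn.Linear(32, 10)

# without an activation, the two layers are equivalent to one linear layer with composed weights
w = lin2.weight @ lin1.weight                    # shape (10, 784)
b = lin2.weight @ lin1.bias + lin2.bias          # shape (10,)
print(torch.allclose(lin2(lin1(x)), x @ w.t() + b, atol=1e-5))  # True

# with ReLU in between, the composition is no longer a single linear map
out = lin2(F.relu(lin1(x)))
print(out.shape)                                 # torch.Size([4, 10])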

from os import path, mkdir
from random import randint

import torch
import numpy as np
import torchvision
from matplotlib import pyplot as plt
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data.dataloader import DataLoader
import torch.nn.functional as F
import torch.nn as nn

dataset = MNIST(root="./data", download=True, transform=ToTensor())
test_dataset = MNIST(root='./data', train=False, transform=ToTensor())

def split_indices(n, rate):
    # number of validation samples
    n_val = int(n * rate)
    # shuffled indices from 0 to n-1, with no repeats
    idxs = np.random.permutation(n)
    # return the indices after n_val and the first n_val indices,
    # i.e. the training indices and the validation indices
    return idxs[n_val:], idxs[:n_val]

train_indices, val_indices = split_indices(len(dataset), 0.2)

batch_size = 100
train_sampler = SubsetRandomSampler(train_indices)
train_loder = DataLoader(dataset,
                         batch_size,
                         sampler=train_sampler)

val_sampler = SubsetRandomSampler(val_indices)
val_loder = DataLoader(dataset,
                       batch_size,
                       sampler=val_sampler)

input_size = 28 * 28
num_classes = 10

class MnistModel(nn.Module):

    def __init__(self, in_size, hidden_size, out_size):
        super().__init__()

        self.linear1 = nn.Linear(in_size, hidden_size)

        self.linear2 = nn.Linear(hidden_size, out_size)

    def forward(self, xb):
        # flatten
        xb = xb.view(xb.size(0), -1)
        # xb = xb.reshape(xb.size(0), -1)
        return self.linear2(F.relu(self.linear1(xb)))

# for t in model.parameters():
#     print(t.shape)

# for img, labels in train_loder:
#     outputs = model(img)
#     loss = F.cross_entropy(outputs, labels)
#     break

def get_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')

def to_device(data, device):
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

# for img, label in train_loder:
#     print(img.shape)
#     img = to_device(img, device)
#     print(img.device)
#     break

class DeviceDataLoder():
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        # lazy loading: instead of moving the whole dataset onto the device at once,
        # move one batch at a time
        for b in self.dl:
            yield to_device(b, self.device)

    def __len__(self):
        return len(self.dl)

# use DeviceDataLoder as a wrapper
train_dl = DeviceDataLoder(train_loder, get_device())
valid_dl = DeviceDataLoder(val_loder, get_device())

def loss_batch(model, loss_func, xb, yb, opt=None, metric=None):
    preds = model(xb)

    loss = loss_func(preds, yb)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()

    metric_result = None
    if metric is not None:
        metric_result = metric(preds, yb)

    return loss.item(), len(xb), metric_result

def evaluate(model, loss_func, valid_dl, metric=None):
    with torch.no_grad():
        results = [loss_batch(model, loss_func, xb, yb, metric=metric)
                   for xb, yb in valid_dl]

        # separate the lists (renamed to avoid shadowing the metric argument)
        losses, nums, metrics = zip(*results)
        total = np.sum(nums)
        avg_loss = np.sum(np.multiply(losses, nums)) / total
        avg_metric = None
        if metric is not None:
            avg_metric = np.sum(np.multiply(metrics, nums)) / total
    return avg_loss, total, avg_metric

def fit(epochs, lr, model, loss_func, train_dl, valid_dl, opt_fn=None, metric=None):
    if opt_fn is None:
        opt_fn = torch.optim.SGD
    opt = opt_fn(model.parameters(), lr=lr)
    loss_history = []
    metric_history = []

    for epoch in range(epochs):
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)
        result = evaluate(model, loss_func, valid_dl, metric)
        val_loss, total, val_metric = result

        loss_history.append(val_loss)
        metric_history.append(val_metric)

        if metric is not None:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {val_loss:.4f}, Metric: {val_metric:.4f}')
        else:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {val_loss:.4f}')

    return loss_history, metric_history

def accuracy(output, label):
    _, preds = torch.max(output, dim=1)
    return torch.sum(label == preds).item() / len(preds)

model = MnistModel(input_size, 32, num_classes)
to_device(model, get_device())

if path.exists('./tutorial5/mnist-logistic.pth'):
    model.load_state_dict(torch.load('./tutorial5/mnist-logistic.pth'))

else:
    loss_history, metric_history = fit(5, 0.5, model, F.cross_entropy,
                                       train_dl,
                                       valid_dl,
                                       opt_fn=torch.optim.SGD,
                                       metric=accuracy)
    # save the weights and biases of this model
    # create the directory first if it does not exist yet
    if not path.exists('./tutorial5'):
        mkdir('./tutorial5')
    torch.save(model.state_dict(), './tutorial5/mnist-logistic.pth')

def prediction_img(img, model):
    # move the image to the same device as the model before predicting
    xb = to_device(img.unsqueeze(0), get_device())
    yb = model(xb)
    _, preds = torch.max(yb, dim=1)
    return preds[0].item()

for i in range(10):
    img, label = test_dataset[randint(0, len(test_dataset) - 1)]
    img_np = np.array(img)
    plt.imshow(img_np.squeeze(), cmap='gray')
    plt.show()
    print(prediction_img(img, model))

PyTorch Tutorial 3

Simple linear regression with the built-in tools in PyTorch:

  1. generate prediction
  2. calculate the loss
  3. compute gradients of w and b
  4. adjust w and b
  5. reset gradients to zero

These five steps correspond to the loop inside the fit function below.

import numpy as np
import torch.nn as nn
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

# temp, rainfall, humidity
# inputs = torch.tensor(np.random.uniform(0, 120, size=(15, 3)))
# the inputs and targets need an explicit dtype here; otherwise, when torch generates the prediction,
# it will run into a dtype mismatch
inputs = torch.tensor(np.array(
    [[109.4144, 11.2775, 32.4521], [2.0002, 47.0248, 49.9469], [27.1528, 57.8907, 91.2076],
     [44.8227, 71.6239, 64.0752], [66.0968, 92.5966, 94.0775], [59.6257, 76.9701, 92.1656],
     [8.1551, 1.7426, 10.5297], [112.6036, 47.2793, 95.4221], [3.2212, 61.8274, 115.9187],
     [35.0351, 110.6133, 66.6992], [8.8387, 21.8008, 50.0480], [68.7698, 59.9815, 12.0230],
     [111.3881, 90.3050, 62.1327], [101.7462, 115.7447, 33.4925], [27.7659, 54.5803, 105.3599]], dtype='float32'))

# apples, oranges
# targets = torch.tensor(np.random.uniform(0, 50, size=(15, 2)))
targets = torch.tensor(np.array(
    [[28.1090, 45.0061], [29.0839, 6.4205], [35.2633, 44.1196],
     [29.5371, 6.8457], [7.4298, 36.1434], [6.6296, 47.1809],
     [49.9750, 49.9321], [34.1796, 16.6732], [46.8875, 7.6084],
     [23.0442, 42.2229], [29.7401, 13.4199], [3.0854, 21.4550],
     [47.6801, 49.1518], [18.7320, 18.4418], [34.2725, 25.8721]], dtype='float32'))
# print(inputs)
# print(targets)

# TensorDataset pairs each input row with the corresponding target row
train_ds = TensorDataset(inputs, targets)

batch_size = 5
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

# each batch holds 5 samples and the data are shuffled,
# but input/target pairs stay aligned; only the order changes
# for xb, yb in train_dl:
#     print("batch:")
#     print(xb)
#     print(yb)

# specify the number of input and output features
model = nn.Linear(3, 2)
# the weight and bias are initialised automatically, with requires_grad set to True
# print(model.weight)
# print(model.bias)
# print(list(model.parameters()))

# preds = model(inputs)
# print(preds)

loss_fn = F.mse_loss
loss = loss_fn(model(inputs), targets)
# print(loss)

opt = torch.optim.SGD(model.parameters(), lr=1e-5)

# 1 generate predictions
# 2 calculate the loss
# 3 compute gradients of w and b
# 4 adjust w and b
# 5 reset gradients to zero
# these 5 steps correspond to the loop in the next function

def fit(num_epochs, model, loss_fn, opt):
    # training iterations
    for epoch in range(num_epochs):
        # batches in each iteration
        for xb, yb in train_dl:
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            opt.step()
            opt.zero_grad()
        if (epoch + 1) % 10 == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

fit(100, model, loss_fn, opt)

PyTorch Tutorial 4

  1. load the dataset
    1. transform the data into tensors
  2. split the dataset into training, testing, and validation sets
    1. define an index-shuffling function (the dataset is ordered; without shuffling, an individual split might contain only one label)
    2. create the samplers and loaders
  3. customise the MnistModel class
  4. define loss_batch
    1. calculate the loss for the current batch
  5. define evaluate
    1. calculate the average loss over the batches
  6. define accuracy
    1. also called the metric; it reports the accuracy
  7. create the fit function
    1. epoch loop
      1. training loop
        1. loss_batch (for training)
      2. evaluate the result
      3. print the result
  8. call fit
from os import path
from random import randint

import torch
import torchvision
from torchvision.datasets import MNIST
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F

# transforms is used to turn the MNIST dataset into tensors that torch can work with
import torchvision.transforms as transforms

# here the dataset is in its original format, which torch cannot work with directly
datasets = MNIST(root='./data', download=True)
# print(len(datasets))

test_dataset = MNIST(root='./data', train=False, transform=transforms.ToTensor())
# print(len(test_dataset))

# img, label = datasets[0]
# plt.imshow(img, cmap='gray')
# plt.show()

# print(label)

# here the dataset is already transformed into tensors
dataset = MNIST(root='./data', download=True, transform=transforms.ToTensor())

# the shape here is 1, 28, 28: channels, height, width
# img_tensor, label = dataset[0]
# print(img_tensor.shape, label)

# print(img_tensor[:, 10:15, 10:15])
# print(torch.max(img_tensor), torch.min(img_tensor))
# plt.imshow(img_tensor[0, 10:15, 10:15], cmap='gray')
# plt.show()

def split_indices(n, rate):
    # number of validation samples
    n_val = int(n * rate)
    # shuffled indices from 0 to n-1, with no repeats
    idxs = np.random.permutation(n)
    # return the indices after n_val and the first n_val indices,
    # i.e. the training indices and the validation indices
    return idxs[n_val:], idxs[:n_val]

train_indices, val_indices = split_indices(len(dataset), 0.2)
# print(len(train_indices), len(val_indices))

# the sampler randomly picks batch_size indices from the given list.
# the point is to cut training time and computation and to train over multiple epochs;
# otherwise training would process the whole dataset at once, which takes too much memory
# and puts too much pressure on computational resources.
# in this way the training is broken into smaller chunks
batch_size = 100
train_sampler = SubsetRandomSampler(train_indices)
train_loder = DataLoader(dataset,
                         batch_size,
                         sampler=train_sampler)

val_sampler = SubsetRandomSampler(val_indices)
val_loder = DataLoader(dataset,
                       batch_size,
                       sampler=val_sampler)

input_size = 28 * 28
num_classes = 10

# model = nn.Linear(input_size, num_classes)

# print(model.weight.shape)
# print(model.bias.shape)
#
# print(model.weight)
# print(model.bias)

# for img, label in train_loder:
#     print(img.shape)
#     print(label)
#     # there is an error here: the image shape is 1*28*28, but the model expects inputs of size 784,
#     # so a customised model is needed.
#     print(model(img))
#     break

class MnistModel(nn.Module):
    def __init__(self):
        super().__init__()
        # define the input and output sizes of the linear layer
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, xb):
        # reshaping with -1 avoids hard-coding; the first dimension is inferred automatically
        xb = xb.reshape(-1, input_size)
        # pass the batch to the linear layer
        out = self.linear(xb)
        return out

model = MnistModel()

# the weight and bias live on the linear layer (model.linear.weight), not on the model itself (model.weight)
# print(model.linear.weight.shape)
# print(model.linear.bias.shape)
#
# print(model.linear.weight)
# print(model.linear.bias)

def accuracy(l1, l2):
    return torch.sum(l1 == l2).item() / len(l2)

(Figure: log plot presentation)

# for img, label in train_loder:
#     # the img passed into the model has shape 100, 1, 28, 28
#     # the output shape is 100, 10
#     # which is what we expect (one score for each digit 0-9)
#     # softmax can be applied here to turn each score into a probability:
#     # probability_i = e^y_i / sum_j(e^y_j)
#     outputs = model(img)
#     # the second parameter is the dimension softmax is applied along,
#     # so 0 means along columns and 1 along rows for a 2D matrix
#     probs = F.softmax(outputs, 1)
#     print(probs.shape)
#     # probs still has shape 100, 10, but every value is a probability (0-1) and each row sums to 1
#     print(outputs.shape)
#     print(outputs[0])
#     max_probs, predicted_labels = torch.max(probs, 1)
#     print(accuracy(predicted_labels, label))

# now we need to define the loss function
# cross entropy is the usual choice for logistic regression
# i.e.
# the true label 9 is represented by the one-hot vector [0,0,0,0,0,0,0,0,0,1]
# if the predicted probability for class 9 is, say, 0.9,
# the cross entropy is -ln(probability of the true class), i.e. -ln(0.9) = 0.10, which is low

# but when the prediction is poor:
# the true label 1 is represented by the one-hot vector [0,1,0,0,0,0,0,0,0,0]
# if the predicted probability for class 1 is only 0.2,
# the cross entropy is -ln(0.2) = 1.6, which is high

# cross entropy only considers the true class; the others are ignored because their one-hot entries are 0

# so a low probability on the correct digit gives a high cross entropy (loss), and vice versa

# define the loss for the current batch
# loss = F.cross_entropy(outputs, label)

# since loss = -ln(probability of the correct class),
# the probability of the correct class is e^(-loss)
# learn_rate = 0.001
# optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate)
# optimizer.step()
# break
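# a quick numeric check of the worked cross-entropy example above (illustrative only)
print(-np.log(0.9))  # ~0.105: confident and correct prediction -> low loss
print(-np.log(0.2))  # ~1.609: low probability on the true class -> high loss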

def loss_batch(model, loss_func, xb, yb, opt=None, metric=None):
    preds = model(xb)
    loss = loss_func(preds, yb)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()

    # metric is used for model evaluation
    metric_result = None
    if metric is not None:
        metric_result = metric(preds, yb)

    return loss.item(), len(xb), metric_result

def evaluate(model, loss_func, valid_dl, metric=None):
    with torch.no_grad():
        results = [loss_batch(model, loss_func, xb, yb, metric=metric)
                   for xb, yb in valid_dl]

        # separate the lists (renamed to avoid shadowing the metric argument)
        losses, nums, metrics = zip(*results)
        total = np.sum(nums)
        avg_loss = np.sum(np.multiply(losses, nums)) / total
        avg_metric = None
        if metric is not None:
            avg_metric = np.sum(np.multiply(metrics, nums)) / total
    return avg_loss, total, avg_metric

def accuracy(output, label):
    _, preds = torch.max(output, dim=1)
    return torch.sum(label == preds).item() / len(preds)

# avg_loss, total, val_acc = evaluate(model, F.cross_entropy, val_loder, metric=accuracy)
# print("Loss: {:.4f}, total:{:.4f}, Accuracy: {:.4f}".format(avg_loss, total, val_acc))

def fit(epochs, model, loss_fn, opt, train_dl, valid_dl, metric=None):
    for epoch in range(epochs):
        for xb, yb in train_dl:
            loss, _, _ = loss_batch(model, loss_fn, xb, yb, opt, metric=metric)

        result = evaluate(model, loss_fn, valid_dl, metric=metric)
        val_loss, total, val_metric = result

        if metric is None:
            print("Epoch [{}/{}], total:{:.4f}, Loss: {:.4f}"
                  .format(epoch + 1, epochs, total, val_loss))
        else:
            print("Epoch [{}/{}], total:{:.4f}, Loss: {:.4f}, {}: {:.4f}"
                  .format(epoch + 1, epochs, total, val_loss, metric.__name__, val_metric))

model = MnistModel()

# load the saved weights if they already exist
if path.exists('mnist-logistic.pth'):
    model.load_state_dict(torch.load('mnist-logistic.pth'))

else:
    fit(5,
        model,
        F.cross_entropy,
        torch.optim.SGD(model.parameters(), lr=0.001),
        train_loder,
        val_loder,
        metric=accuracy)
    # this saves the weights and biases of the model
    torch.save(model.state_dict(), 'mnist-logistic.pth')

# read the saved model into a new instance
# model2 = MnistModel()
# model2.load_state_dict(torch.load('mnist-logistic.pth'))
# model2.state_dict()

def prediction_img(img, model):
    xb = img.unsqueeze(0)
    yb = model(xb)
    _, preds = torch.max(yb, dim=1)
    return preds[0].item()

for i in range(10):
    img, label = test_dataset[randint(0, len(test_dataset) - 1)]
    img_np = np.array(img)
    plt.imshow(img_np.squeeze(), cmap='gray')
    plt.show()
    print(prediction_img(img, model))

Question

  1. loading test_dataset without the transform parameter made the validation section fail, because the raw PIL image is not a tensor and has no squeeze method
  2. zip(*results) unpacks the list of tuples into separate sequences, which can then be assigned to multiple variables (see the small demo below)
  3. avg_loss = np.sum(np.multiply(loss, nums)) / total: the multiply weights each batch loss by its batch size, because the last batch may be smaller than the others
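A small demo of points 2 and 3, assuming three batches where the last one is smaller (the numbers are made up for illustration):

import numpy as np

# each tuple is (batch_loss, batch_size, batch_metric), as returned by loss_batch
results = [(0.50, 100, 0.80), (0.70, 100, 0.75), (0.40, 50, 0.90)]

# zip(*results) unpacks the list of tuples into three separate sequences
losses, nums, metrics = zip(*results)
print(losses)  # (0.5, 0.7, 0.4)
print(nums)    # (100, 100, 50)

# weighted average: each batch loss is weighted by its batch size,
# so the smaller last batch counts proportionally less
total = np.sum(nums)
avg_loss = np.sum(np.multiply(losses, nums)) / total
print(avg_loss)         # 0.56
print(np.mean(losses))  # 0.5333..., slightly different from the weighted value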

PyTorch Tutorial 1

import torch
import numpy as np

t1 = torch.tensor(4.)
print(t1)
print(t1.dtype)

t2 = torch.tensor([1., 2, 3, 4])
print(t2)
print(t2.dtype)
# in this case, all the data will be converted to the same data type
# [1., 2., 3., 4.]

t3 = torch.tensor([1., 2, 3, 4])
print(t3)
print(t3.dtype)

t4 = torch.tensor([[1, 2], [1., 4], [4, 3], [5, 6]])
print(t4)
print(t4.dtype)

print(t1.shape)
print(t2.shape)
print(t3.shape)
print(t4.shape)

# ---
x = torch.tensor(3., requires_grad=True)
w = torch.tensor(4., requires_grad=True)
b = torch.tensor(5., requires_grad=True)

y = w * x + b
print(y)
y.backward()

print(x.grad)
print(w.grad)
print(b.grad)

# convert numpy to torch
x = np.array([[1, 2], [2, 4]])

# use shared memory space, not copy
y = torch.from_numpy(x)

# copy data
y = torch.tensor(x)

print(y)
print(y.dtype)

# convert torch to numpy
z = y.numpy()
print(z)

PyTorch Tutorial 2

Simple linear regression with the autograd method in PyTorch:

  1. @ performs matrix multiplication (the inner/dot product)
  2. .t() returns the transposed matrix
  3. .numel() returns the number of elements in a tensor
  4. with torch.no_grad() means the code inside this block will not track gradients, which saves memory and computation time (a small demo of these four operations follows below)
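A small stand-alone demo of these four operations (the tensors here are arbitrary examples, not the tutorial's data):

import torch

a = torch.tensor([[1., 2., 3.],
                  [4., 5., 6.]])   # shape (2, 3)
w = torch.randn(4, 3)              # shape (4, 3)

print(a @ w.t())                   # matrix multiplication with the transpose: (2, 3) @ (3, 4) -> (2, 4)
print(a.t().shape)                 # torch.Size([3, 2])
print(a.numel())                   # 6 elements in total

x = torch.tensor(2., requires_grad=True)
with torch.no_grad():
    y = x * 3                      # computed without tracking gradients
print(y.requires_grad)             # False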
import torch
import numpy as np

inputs = np.array([[0, 0, 3],
                   [0, 1, 9],
                   [1, 0, 8],
                   [1, 1, 28]], dtype='float32')

outputs = np.array([[0, 1],
                    [9, 4],
                    [7, 3],
                    [6, 7]], dtype='float32')

inputs = torch.from_numpy(inputs)
outputs = torch.from_numpy(outputs)

w = torch.randn(2, 3, requires_grad=True)
b = torch.randn(2, requires_grad=True)

# print(b)

def model(x):
    # b is a vector; when it is added to the matrix, it is broadcast across the rows
    return x @ w.t() + b

def mse(t1, t2):
    return torch.sum((t1 - t2) ** 2) / t1.numel()

learning_rate = 1e-5
for t in range(500):
    y_pred = model(inputs)
    loss = mse(y_pred, outputs)
    loss.backward()
    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad
        w.grad.zero_()
        b.grad.zero_()
print(loss.item())