Fastai Course: Deep Learning From the Foundations, CUDA and Hooks
Using CUDA and Hooks (Lesson 3, Part 3)
#collapse
%load_ext autoreload
%autoreload 2
%matplotlib inline
#collapse
from exp.nb_05b import *
torch.set_num_threads(2)
#collapse
x_train,y_train,x_valid,y_valid = get_data()
Helper function to quickly normalize with the mean and standard deviation from our training set:
#collapse_show
def normalize_to(train, valid):
m,s = train.mean(),train.std()
return normalize(train, m, s), normalize(valid, m, s)
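For reference, normalize comes from an earlier notebook and simply subtracts the mean and divides by the standard deviation; here is a minimal sketch of the assumed definition (it's already available via the import above):
#collapse_show
# Assumed definition from the earlier notebooks (already imported above)
def normalize(x, m, s): return (x-m)/s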
#collapse_show
x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
Let's check it behaved properly.
#collapse_show
x_train.mean(),x_train.std()
#collapse_show
nh,bs = 50,512
c = y_train.max().item()+1
loss_func = F.cross_entropy
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)
To refactor layers, it's useful to have a Lambda layer that can take a basic function and convert it into a layer you can put in nn.Sequential.
NB: if you use a Lambda layer with a lambda function, your model won't pickle, so you won't be able to save it with PyTorch. It's best to give a name to the function you use inside your Lambda (like flatten below).
#collapse_show
class Lambda(nn.Module):
def __init__(self, func):
super().__init__()
self.func = func
def forward(self, x): return self.func(x)
def flatten(x): return x.view(x.shape[0], -1)
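To see the pickling caveat in practice (a hypothetical cell, not in the original notebook): a Lambda wrapping a named function serializes fine, while one wrapping an anonymous lambda raises a pickling error.
#collapse_show
import pickle
# Wrapping a named function pickles without trouble
_ = pickle.dumps(Lambda(flatten))
# Wrapping an anonymous lambda fails; uncomment to see the error
# pickle.dumps(Lambda(lambda x: x.view(x.shape[0], -1)))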
The next function takes the flat vector of size bs x 784 and puts it back as a batch of images of 28 by 28 pixels:
#collapse_show
def mnist_resize(x): return x.view(-1, 1, 28, 28)
We can now define a simple CNN. Thanks to the Lambda class we can use mnist_resize directly in our model, as well as flatten before the fully connected layer.
#collapse_show
def get_cnn_model(data):
return nn.Sequential(
Lambda(mnist_resize),
nn.Conv2d( 1, 8, 5, padding=2,stride=2), nn.ReLU(), #14
nn.Conv2d( 8,16, 3, padding=1,stride=2), nn.ReLU(), # 7
nn.Conv2d(16,32, 3, padding=1,stride=2), nn.ReLU(), # 4
nn.Conv2d(32,32, 3, padding=1,stride=2), nn.ReLU(), # 2
nn.AdaptiveAvgPool2d(1),
Lambda(flatten),
nn.Linear(32,data.c)
)
#collapse_show
model = get_cnn_model(data)
Basic callbacks from the previous notebook:
#collapse_show
cbfs = [Recorder, partial(AvgStatsCallback,accuracy)]
#collapse_show
opt = optim.SGD(model.parameters(), lr=0.4)
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)
#collapse_show
%time run.fit(1, learn)
This took a long time to run, so it's time to use a GPU. A simple Callback can make sure the model, inputs and targets are all on the same device.
#collapse_show
# Somewhat more flexible way
device = torch.device('cuda',0)
#collapse_show
class CudaCallback(Callback):
def __init__(self,device): self.device=device
def begin_fit(self): self.model.to(self.device)
def begin_batch(self): self.run.xb,self.run.yb = self.xb.to(self.device),self.yb.to(self.device)
We can also set the device as our default:
#collapse_show
torch.cuda.set_device(device)
#collapse_show
class CudaCallback(Callback):
def begin_fit(self): self.model.cuda()
def begin_batch(self): self.run.xb,self.run.yb = self.xb.cuda(),self.yb.cuda()
#collapse
cbfs.append(CudaCallback)
#collapse
model = get_cnn_model(data)
#collapse_show
opt = optim.SGD(model.parameters(), lr=0.4)
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)
#collapse_show
%time run.fit(3, learn)
Now, that's definitely faster!
First we can regroup each conv/ReLU pair into a single function. We also set default values for the stride and filter size.
#collapse_show
def conv2d(ni, nf, ks=3, stride=2):
return nn.Sequential(
nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride), nn.ReLU())
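As a quick sanity check (a hypothetical cell, not in the original notebook), a stride-2 convolution with padding ks//2 halves the spatial size, which is where the #14, #7, ... comments in the earlier model come from:
#collapse_show
# A 28x28 single-channel input comes out as 8 channels of 14x14
conv2d(1, 8, 5)(torch.randn(2, 1, 28, 28)).shape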
We can also do the MNIST resize as a batch transform, implemented with a Callback. The callback takes a transformation function and stores it away; in this case the transform resizes our input, and we set it as the default with partial.
#collapse_show
class BatchTransformXCallback(Callback):
_order=2
def __init__(self, tfm): self.tfm = tfm
def begin_batch(self): self.run.xb = self.tfm(self.xb)
def view_tfm(*size):
def _inner(x): return x.view(*((-1,)+size))
return _inner
#collapse_show
mnist_view = view_tfm(1,28,28)
cbfs.append(partial(BatchTransformXCallback, mnist_view))
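As a quick check (a hypothetical cell, not in the original notebook), mnist_view turns a batch of flat 784-vectors back into 1x28x28 images:
#collapse_show
# A flat batch of four 784-vectors becomes four 1x28x28 images
mnist_view(x_train[:4]).shape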
With the AdaptiveAvgPool, this model can now work on any size input:
#collapse_show
nfs = [8,16,32,32]
We can pass this list of filter numbers to the functions below and configure the network easily that way.
#collapse_show
def get_cnn_layers(data, nfs):
nfs = [1] + nfs
return [
conv2d(nfs[i], nfs[i+1], 5 if i==0 else 3)
for i in range(len(nfs)-1)
] + [nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c)]
def get_cnn_model(data, nfs): return nn.Sequential(*get_cnn_layers(data, nfs))
And this helper function will quickly give us everything needed to run the training. The kernel size is 5 in the first layer and 3 otherwise: the first layer only has one input channel, so with a 3x3 kernel each output would be computed from just 9 values, barely more than the 8 filters we produce; a larger filter gives that layer enough inputs to do useful computation and extract useful features.
#collapse_show
def get_runner(model, data, lr=0.6, cbs=None, opt_func=None, loss_func = F.cross_entropy):
if opt_func is None: opt_func = optim.SGD
opt = opt_func(model.parameters(), lr=lr)
learn = Learner(model, opt, loss_func, data)
return learn, Runner(cb_funcs=listify(cbs))
#collapse
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, data, lr=0.4, cbs=cbfs)
#collapse_show
model
#collapse
run.fit(3, learn)
Let's say we want to do some telemetry, and want the mean and standard deviation of the activations of each layer in the model. First we can do it manually like this:
#collapse_show
class SequentialModel(nn.Module):
def __init__(self, *layers):
super().__init__()
self.layers = nn.ModuleList(layers)
self.act_means = [[] for _ in layers]
self.act_stds = [[] for _ in layers]
def __call__(self, x):
for i,l in enumerate(self.layers):
x = l(x)
self.act_means[i].append(x.data.mean())
self.act_stds [i].append(x.data.std ())
return x
def __iter__(self): return iter(self.layers)
#collapse
model = SequentialModel(*get_cnn_layers(data, nfs))
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
#collapse
run.fit(2, learn)
Now we can have a look at the means and standard deviations of the activations at the beginning of training. These look awful right now since we haven't used any initialization technique so far.
Means
#collapse_show
for l in model.act_means: plt.plot(l)
plt.legend(range(6));
Std. Deviation
#collapse_show
for l in model.act_stds: plt.plot(l)
plt.legend(range(6));
First 10 means (of the first 10 batches)
#collapse_show
for l in model.act_means: plt.plot(l[:10])
plt.legend(range(6));
First 10 std. deviations (of the first 10 batches)
#collapse_show
for l in model.act_stds: plt.plot(l[:10])
plt.legend(range(6));
Hooks are PyTorch objects you can add to any nn.Module. A hook is called when the layer it is registered to is executed during the forward pass (a forward hook) or the backward pass (a backward hook).
Hooks don't require us to rewrite the model; they are essentially PyTorch's version of callbacks.
#collapse
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, data, lr=0.5, cbs=cbfs)
#collapse_show
act_means = [[] for _ in model]
act_stds = [[] for _ in model]
A hook is attached to a layer and needs a function that takes three arguments: module, input, output. Here we store the mean and std of the output in the correct position of our list.
#collapse_show
def append_stats(i, mod, inp, outp):
act_means[i].append(outp.data.mean())
act_stds [i].append(outp.data.std())
#collapse_show
for i,m in enumerate(model): m.register_forward_hook(partial(append_stats, i))
#collapse
run.fit(1, learn)
#collapse_show
for o in act_means: plt.plot(o)
plt.legend(range(5));
We can refactor this into a Hook class. It's very important to remove the hooks when we're done with them, otherwise references are kept around and the memory won't be properly released when your model is deleted.
#collapse_show
def children(m): return list(m.children())
class Hook():
def __init__(self, m, f): self.hook = m.register_forward_hook(partial(f, self))
def remove(self): self.hook.remove()
def __del__(self): self.remove()
def append_stats(hook, mod, inp, outp):
if not hasattr(hook,'stats'): hook.stats = ([],[])
means,stds = hook.stats
means.append(outp.data.mean())
stds .append(outp.data.std())
NB: In fastai we use a bool param to choose whether to make it a forward or backward hook. In the above version we're only supporting forward hooks.
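The same pattern works for any per-layer statistic. For instance, here is a hypothetical hook function (not in the original notebook) that just counts how many times its layer was run:
#collapse_show
# Hypothetical example: partial(f, self) passes the Hook object as the first
# argument, so the function can stash whatever state it likes on it
def count_calls(hook, mod, inp, outp):
    if not hasattr(hook, 'n_calls'): hook.n_calls = 0
    hook.n_calls += 1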
#collapse
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, data, lr=0.5, cbs=cbfs)
#collapse_show
hooks = [Hook(l, append_stats) for l in children(model[:4])]
#collapse
run.fit(1, learn)
#collapse_show
for h in hooks:
plt.plot(h.stats[0])
h.remove()
plt.legend(range(4));
Let's design our own class that can contain a list of objects. It will behave a bit like a numpy array in the sense that we can index into it via:
- a single index
- a slice (like 1:5)
- a list of indices
- a mask of indices ([True,False,False,True,...])
The __iter__ method is there to be able to do things like for x in ...
#collapse_show
class ListContainer():
def __init__(self, items): self.items = listify(items)
def __getitem__(self, idx):
if isinstance(idx, (int,slice)): return self.items[idx]
if isinstance(idx[0],bool):
assert len(idx)==len(self) # bool mask
return [o for m,o in zip(idx,self.items) if m]
return [self.items[i] for i in idx]
def __len__(self): return len(self.items)
def __iter__(self): return iter(self.items)
def __setitem__(self, i, o): self.items[i] = o
def __delitem__(self, i): del(self.items[i])
def __repr__(self):
res = f'{self.__class__.__name__} ({len(self)} items)\n{self.items[:10]}'
if len(self)>10: res = res[:-1]+ '...]'
return res
#collapse_show
ListContainer(range(10))
#collapse
ListContainer(range(100))
#collapse_show
t = ListContainer(range(10))
t[[1,2]], t[[False]*8 + [True,False]]
We can use it to write a Hooks class that contains several hooks. We will also use it in the next notebook as a container for our objects in the data block API. We add a __del__ method so the hooks are removed when the object isn't used anymore, to free up memory.
#collapse_show
from torch.nn import init
class Hooks(ListContainer):
def __init__(self, ms, f): super().__init__([Hook(m, f) for m in ms])
def __enter__(self, *args): return self
def __exit__ (self, *args): self.remove()
def __del__(self): self.remove()
def __delitem__(self, i):
self[i].remove()
super().__delitem__(i)
def remove(self):
for h in self: h.remove()
#collapse
model = get_cnn_model(data, nfs).cuda()
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
#collapse
hooks = Hooks(model, append_stats)
hooks
#collapse
hooks.remove()
#collapse
x,y = next(iter(data.train_dl))
x = mnist_resize(x).cuda()
#collapse
x.mean(),x.std()
#collapse
p = model[0](x)
p.mean(),p.std()
#collapse
for l in model:
if isinstance(l, nn.Sequential):
init.kaiming_normal_(l[0].weight)
l[0].bias.data.zero_()
#collapse
p = model[0](x)
p.mean(),p.std()
Having given an __enter__ and __exit__ method to our Hooks class, we can use it as a context manager. This makes sure that once we are out of the with block, all the hooks have been removed and aren't there to pollute our memory.
#collapse_show
with Hooks(model, append_stats) as hooks:
run.fit(2, learn)
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
ms,ss = h.stats
ax0.plot(ms[:10])
ax1.plot(ss[:10])
plt.legend(range(6));
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
ms,ss = h.stats
ax0.plot(ms)
ax1.plot(ss)
plt.legend(range(6));
Let's store more than the means and stds and plot histograms of our activations now.
#collapse_show
def append_stats(hook, mod, inp, outp):
if not hasattr(hook,'stats'): hook.stats = ([],[],[])
means,stds,hists = hook.stats
means.append(outp.data.mean().cpu())
stds .append(outp.data.std().cpu())
hists.append(outp.data.cpu().histc(40,0,10)) #histc isn't implemented on the GPU
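For reference (a hypothetical cell, not in the original notebook), histc(40, 0, 10) counts how many values fall into each of 40 equally spaced bins between 0 and 10, so every forward pass adds one 40-bin histogram per hooked layer:
#collapse_show
# 40 bin counts over the range [0, 10]; values outside that range are ignored
torch.randn(1000).abs().histc(40, 0, 10).shape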
#collapse
model = get_cnn_model(data, nfs).cuda()
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
#collapse_show
for l in model:
if isinstance(l, nn.Sequential):
init.kaiming_normal_(l[0].weight)
l[0].bias.data.zero_()
#collapse_show
with Hooks(model, append_stats) as hooks: run.fit(1, learn)
#collapse_show
# Thanks to @ste for the initial version of the histogram plotting code
def get_hist(h): return torch.stack(h.stats[2]).t().float().log1p()
#collapse_show
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
ax.imshow(get_hist(h), origin='lower')
ax.axis('off')
plt.tight_layout()
From the histograms, we can easily get more information, such as what fraction of the activations are close to zero (the bottom bins of the histogram):
#collapse_show
def get_min(h):
h1 = torch.stack(h.stats[2]).t().float()
return h1[:2].sum(0)/h1.sum(0)
#collapse_show
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
ax.plot(get_min(h))
ax.set_ylim(0,1)
plt.tight_layout()
Now let's use our model with a generalized ReLU that can be leaky, shifted down by a constant, and clamped to a maximum value.
#collapse_show
def get_cnn_layers(data, nfs, layer, **kwargs):
nfs = [1] + nfs
return [layer(nfs[i], nfs[i+1], 5 if i==0 else 3, **kwargs)
for i in range(len(nfs)-1)] + [
nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c)]
def conv_layer(ni, nf, ks=3, stride=2, **kwargs):
return nn.Sequential(
nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride), GeneralRelu(**kwargs))
class GeneralRelu(nn.Module):
def __init__(self, leak=None, sub=None, maxv=None):
super().__init__()
self.leak,self.sub,self.maxv = leak,sub,maxv
def forward(self, x):
x = F.leaky_relu(x,self.leak) if self.leak is not None else F.relu(x)
if self.sub is not None: x.sub_(self.sub)
if self.maxv is not None: x.clamp_max_(self.maxv)
return x
def init_cnn(m, uniform=False):
f = init.kaiming_uniform_ if uniform else init.kaiming_normal_
for l in m:
if isinstance(l, nn.Sequential):
f(l[0].weight, a=0.1)
l[0].bias.data.zero_()
def get_cnn_model(data, nfs, layer, **kwargs):
return nn.Sequential(*get_cnn_layers(data, nfs, layer, **kwargs))
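As a quick illustration (a hypothetical cell, not in the original notebook), GeneralRelu is leaky below zero, shifted down by sub, and clamped at maxv:
#collapse_show
# Negative values are scaled by `leak`, everything is shifted down by `sub`,
# and anything above `maxv` is clamped
GeneralRelu(leak=0.1, sub=0.4, maxv=6.)(torch.tensor([-2., 0., 1., 10.]))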
#collapse_show
def append_stats(hook, mod, inp, outp):
if not hasattr(hook,'stats'): hook.stats = ([],[],[])
means,stds,hists = hook.stats
means.append(outp.data.mean().cpu())
stds .append(outp.data.std().cpu())
hists.append(outp.data.cpu().histc(40,-7,7))
#collapse_show
model = get_cnn_model(data, nfs, conv_layer, leak=0.1, sub=0.4, maxv=6.)
init_cnn(model)
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
#collapse_show
with Hooks(model, append_stats) as hooks:
run.fit(1, learn)
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
ms,ss,hi = h.stats
ax0.plot(ms[:10])
ax1.plot(ss[:10])
h.remove()
plt.legend(range(5));
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
ms,ss,hi = h.stats
ax0.plot(ms)
ax1.plot(ss)
plt.legend(range(5));
#collapse_show
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
ax.imshow(get_hist(h), origin='lower')
ax.axis('off')
plt.tight_layout()
#collapse_show
def get_min(h):
h1 = torch.stack(h.stats[2]).t().float()
return h1[19:22].sum(0)/h1.sum(0)
#collapse_show
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
ax.plot(get_min(h))
ax.set_ylim(0,1)
plt.tight_layout()
#collapse_show
def get_learn_run(nfs, data, lr, layer, cbs=None, opt_func=None, uniform=False, **kwargs):
model = get_cnn_model(data, nfs, layer, **kwargs)
init_cnn(model, uniform=uniform)
return get_runner(model, data, lr=lr, cbs=cbs, opt_func=opt_func)
#collapse_show
sched = combine_scheds([0.5, 0.5], [sched_cos(0.2, 1.), sched_cos(1., 0.1)])
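To visualize the resulting schedule (a hypothetical cell, assuming sched maps a training progress between 0 and 1 to a learning rate, as in the annealing notebook): a cosine warm-up from 0.2 to 1.0 over the first half of training, then a cosine decay down to 0.1.
#collapse_show
# Plot the learning rate as a function of training progress
xs = [o/100 for o in range(100)]
plt.plot(xs, [sched(x) for x in xs]);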
#collapse_show
learn,run = get_learn_run(nfs, data, 1., conv_layer, cbs=cbfs+[partial(ParamScheduler,'lr', sched)])
#collapse
run.fit(8, learn)
Uniform init may provide more useful initial weights (a normal distribution puts a lot of them close to 0).
#collapse
learn,run = get_learn_run(nfs, data, 1., conv_layer, uniform=True,
cbs=cbfs+[partial(ParamScheduler,'lr', sched)])
#collapse
run.fit(8, learn)