Fastai Imagenet(te) training

In theory FP16 should give a 2x speed-up; in practice the gain also depends on the ratio of FP16 to FP32 cores on your GPU.
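
As a point of reference, here is a minimal mixed-precision training step in plain PyTorch using torch.cuda.amp (available in recent PyTorch versions). This is independent of the fastai callbacks used below, and fp16_step is just an illustrative name:

#collapse_show
import torch
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

def fp16_step(model, xb, yb, loss_func, opt):
    opt.zero_grad()
    with autocast():                   # forward pass runs in fp16 where it is safe
        loss = loss_func(model(xb), yb)
    scaler.scale(loss).backward()      # scale the loss to avoid fp16 gradient underflow
    scaler.step(opt)                   # unscales gradients, then runs the optimizer step
    scaler.update()                    # adjust the scale factor for the next iteration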

#collapse
%load_ext autoreload
%autoreload 2

%matplotlib inline

#collapse

from exp.nb_10c import *

#collapse
path = Path("/media/cedric/Datasets/imagenette2-160/")
size = 128
tfms = [make_rgb, RandomResizedCrop(size, scale=(0.35, 1)), np_to_float, PilRandomFlip()]

bs = 64

il = ImageList.from_files(path, tfms=tfms)
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='val'))
ll = label_by_func(sd, parent_labeler, proc_y=CategoryProcessor())

# validation uses deterministic transforms only
ll.valid.x.tfms = [make_rgb, CenterCrop(size), np_to_float]

data = ll.to_databunch(bs, c_in=3, c_out=10, num_workers=8)
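
Before building models, a quick sanity check on the pipeline (assuming the DataBunch from the earlier notebooks exposes train_dl):

#collapse_show
xb, yb = next(iter(data.train_dl))
print(xb.shape, yb.shape)  # expect torch.Size([64, 3, 128, 128]) and torch.Size([64])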

XResNet

#collapse_show
def pass_through(x): return x  # identity: used for branches that should do nothing

class Flatten(nn.Module):
    def forward(self, x): return x.view(x.size(0), -1)

def conv(cin, cout, ks=3, stride=1, bias=False):
    # bias is off because every conv is followed by a BatchNorm layer
    return nn.Conv2d(cin, cout, kernel_size=ks, stride=stride, padding=ks//2, bias=bias)

#collapse_show
activation = nn.ReLU(inplace=True)

def init_cnn(m):
    if getattr(m, 'bias', None) is not None: nn.init.constant_(m.bias, 0)
    if isinstance(m, (nn.Conv2d, nn.Linear)): nn.init.kaiming_normal_(m.weight)
    for l in m.children(): init_cnn(l)

def conv_layer(cin, cout, ks=3, stride=1, zero_bn=False, act=True):
    bn = nn.BatchNorm2d(cout)
    # zero_bn zeroes the BN weight so the whole layer outputs 0 at init (see below)
    nn.init.constant_(bn.weight, 0. if zero_bn else 1.)
    layers = [conv(cin, cout, ks, stride), bn]
    if act: layers.append(activation)
    return nn.Sequential(*layers)
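
To see what zero_bn buys us: with the BatchNorm weight (and its default bias) at zero, the layer outputs exactly zero at init, no matter what the conv weights are:

#collapse_show
layer = conv_layer(3, 8, zero_bn=True, act=False)
x = torch.randn(2, 3, 16, 16)
print(layer(x).abs().max())  # tensor(0., ...)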

#collapse_show
class ResBlock(nn.Module):
    def __init__(self, expansion, ni, nh, stride=1):
        super().__init__()
        nf, ni = nh*expansion, ni*expansion
        if expansion == 1:
            # smaller nets: two 3x3 convs
            layers = [conv_layer(ni, nh, 3, stride=stride),
                      conv_layer(nh, nf, 3, zero_bn=True, act=False)]
        else:
            # larger nets: bottleneck of 1x1 down, 3x3, 1x1 up (ResNet-D path A)
            layers = [conv_layer(ni, nh, 1),
                      conv_layer(nh, nh, 3, stride=stride),
                      conv_layer(nh, nf, 1, zero_bn=True, act=False)]

        self.convs = nn.Sequential(*layers)
        # identity path: 1x1 conv only when the filter count changes,
        # AvgPool only when the grid size changes (ceil_mode=True keeps odd sizes aligned)
        self.idconv = pass_through if ni == nf else conv_layer(ni, nf, 1, act=False)
        self.pool = pass_through if stride == 1 else nn.AvgPool2d(2, ceil_mode=True)

    def forward(self, x): return activation(self.convs(x) + self.idconv(self.pool(x)))

ResBlock Details

The last BatchNorm layer in each block is initialized with a weight of 0 (via zero_bn), while every other BatchNorm starts at 1. This zeroes the conv branch at init while leaving the identity mapping intact, so each block starts out as (roughly) the identity and gradients don't explode early in training. ResNet-50 and larger use three convs per block, arranged as a bottleneck (e.g. 64 filters -> 16 filters -> 64 filters); smaller ResNets use two 3x3 convs. ResNet-D also downsamples the identity path so the two branches can be added: when the stride is not 1, an AvgPool layer with stride 2 matches the grid size, and a 1x1 conv changes the number of filters when they differ.
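
A small check of both properties, with hypothetical sizes (the filter counts match here, so only the pooling branch is active on the identity path):

#collapse_show
blk = ResBlock(expansion=4, ni=16, nh=16, stride=2)   # ni*expansion == nh*expansion == 64
x = torch.randn(2, 64, 32, 32)
out = blk(x)
print(out.shape)                                      # torch.Size([2, 64, 16, 16])
# with zero_bn the conv branch starts at zero, so the block is ReLU(AvgPool(x)) at init
print(torch.allclose(out, torch.relu(blk.pool(x))))   # True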

#collapse_show
class XResNet(nn.Sequential):
    @classmethod
    def create(cls, expansion, layers, c_in=3, c_out=1000):
        # stem: three 3x3 convs (the first with stride 2) instead of ResNet's single 7x7
        nfs = [c_in, (c_in+1)*8, 64, 64]
        stem = [conv_layer(nfs[i], nfs[i+1], stride=2 if i==0 else 1)
                for i in range(3)]
        nfs = [64//expansion, 64, 128, 256, 512]
        res_layers = [cls._make_layer(expansion, nfs[i], nfs[i+1],
                                      n_blocks=l, stride=1 if i==0 else 2)
                      for i,l in enumerate(layers)]
        res = cls(*stem, nn.MaxPool2d(kernel_size=3, stride=2, padding=1), *res_layers,
                  nn.AdaptiveAvgPool2d(1), Flatten(), nn.Linear(nfs[-1]*expansion, c_out))
        init_cnn(res)
        return res

    @staticmethod
    def _make_layer(expansion, ni, nf, n_blocks, stride):
        # only the first block of each stage downsamples
        return nn.Sequential(*[ResBlock(expansion, ni if i==0 else nf, nf, stride if i==0 else 1)
                               for i in range(n_blocks)])
        
        

#collapse_show
def xresnet18 (**kwargs): return XResNet.create(1, [2, 2,  2, 2], **kwargs)
def xresnet34 (**kwargs): return XResNet.create(1, [3, 4,  6, 3], **kwargs)
def xresnet50 (**kwargs): return XResNet.create(4, [3, 4,  6, 3], **kwargs)
def xresnet101(**kwargs): return XResNet.create(4, [3, 4, 23, 3], **kwargs)
def xresnet152(**kwargs): return XResNet.create(4, [3, 8, 36, 3], **kwargs)
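
A quick size comparison of the constructors (just printing parameter counts, no claims about exact values):

#collapse_show
for f in (xresnet18, xresnet34, xresnet50):
    m = f(c_out=10)
    print(f.__name__, sum(p.numel() for p in m.parameters()))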

Train

#collapse_show
cbfs = [partial(AvgStatsCallback,accuracy), ProgressCallback, CudaCallback,
        partial(BatchTransformXCallback, norm_imagenette),
#         partial(MixUp, alpha=0.2)
       ]

#collapse_show
loss_func = LabelSmoothingCrossEntropy()
arch = partial(xresnet18, c_out=10)
opt_func = adam_opt(mom=0.9, mom_sqr=0.99, eps=1e-6, wd=1e-2)
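
LabelSmoothingCrossEntropy comes from the earlier notebook; the idea is to blend the usual cross entropy with a uniform distribution over classes, so the model isn't pushed toward infinitely confident predictions. A minimal sketch of the math, not necessarily the exact implementation imported above:

#collapse_show
import torch.nn.functional as F

def label_smoothing_ce(logits, target, eps=0.1):
    logp = F.log_softmax(logits, dim=-1)
    nll  = F.nll_loss(logp, target)                       # loss against the true labels
    return (1-eps)*nll + eps*(-logp.mean(dim=-1)).mean()  # plus eps worth of uniform targets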

#collapse_show
def get_batch(dl, learn):
    learn.xb,learn.yb = next(iter(dl))
    learn.do_begin_fit(0)   # set up callbacks as if a (zero-epoch) fit were starting
    learn('begin_batch')    # run batch transforms, e.g. normalization and moving to CUDA
    learn('after_fit')      # tear down immediately so the learner is left in a clean state
    return learn.xb,learn.yb

We need to replace the old model_summary since it used to take a Runner.

#collapse_show
def model_summary(model, data, find_all=False, print_mod=False):
    xb,yb = get_batch(data.valid_dl, learn)  # uses the global `learn` defined below
    mods = find_modules(model, is_lin_layer) if find_all else model.children()
    f = lambda hook,mod,inp,out: print(f"====\n{mod}\n" if print_mod else "", out.shape)
    with Hooks(mods, f) as hooks: model(xb)  # run the model argument, not learn.model

#collapse
learn = Learner(arch(), data, loss_func, lr=1, cb_funcs=cbfs, opt_func=opt_func)

#collapse
learn.model = learn.model.cuda()
model_summary(learn.model, data, print_mod=False)
 torch.Size([128, 32, 64, 64])
 torch.Size([128, 64, 64, 64])
 torch.Size([128, 64, 64, 64])
 torch.Size([128, 64, 32, 32])
 torch.Size([128, 64, 32, 32])
 torch.Size([128, 128, 16, 16])
 torch.Size([128, 256, 8, 8])
 torch.Size([128, 512, 4, 4])
 torch.Size([128, 512, 1, 1])
 torch.Size([128, 512])
 torch.Size([128, 10])

#collapse
arch = partial(xresnet34, c_out=10)

#collapse
learn = Learner(arch(), data, loss_func, lr=1, cb_funcs=cbfs, opt_func=opt_func)

#collapse
learn.fit(1, cbs=[LR_Find(), Recorder()])

#collapse
learn.recorder.plot(3)

#collapse_show
def create_phases(phases):
    phases = listify(phases)
    return phases + [1-sum(phases)]

#collapse_show
print(create_phases(0.3))
print(create_phases([0.3,0.2]))
[0.3, 0.7]
[0.3, 0.2, 0.5]

#collapse_show
lr = 1e-2
pct_start = 0.5
phases = create_phases(pct_start)
sched_lr  = combine_scheds(phases, cos_1cycle_anneal(lr/10., lr, lr/1e5))
sched_mom = combine_scheds(phases, cos_1cycle_anneal(0.95, 0.85, 0.95))
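
The combined schedules are plain functions of overall training progress in [0, 1] (as built in the earlier annealing notebook), so we can inspect them directly:

#collapse_show
for pos in (0., 0.25, 0.5, 0.75, 0.999):  # just shy of 1. to stay inside the last phase
    print(pos, sched_lr(pos), sched_mom(pos))
# lr rises cosine-wise from lr/10 to lr over the first half, then anneals down to lr/1e5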

#collapse_show
cbsched = [
    ParamScheduler('lr', sched_lr),
    ParamScheduler('mom', sched_mom)]

#collapse_show
learn = Learner(arch(), data, loss_func, lr=lr, cb_funcs=cbfs, opt_func=opt_func)

#collapse_show
learn.fit(5, cbs=cbsched)
epoch train_loss train_accuracy valid_loss valid_accuracy time
0 1.747138 0.461189 2.137488 0.448408 00:20
1 1.530650 0.573239 1.779503 0.470064 00:20
2 1.386309 0.634280 1.324897 0.672357 00:20
3 1.207779 0.713275 1.158664 0.729172 00:20
4 1.045097 0.778435 0.997302 0.799745 00:20

cnn_learner

#collapse_show
def cnn_learner(arch, data, loss_func, opt_func, c_in=None, c_out=None,
                lr=1e-2, cuda=True, norm=None, progress=True, mixup=0, xtra_cb=None, **kwargs):
    cbfs = [partial(AvgStatsCallback,accuracy)]+listify(xtra_cb)
    if progress: cbfs.append(ProgressCallback)
    if cuda:     cbfs.append(CudaCallback)
    if norm:     cbfs.append(partial(BatchTransformXCallback, norm))
    if mixup:    cbfs.append(partial(MixUp, mixup))
    arch_args = {}
    if not c_in : c_in  = data.c_in
    if not c_out: c_out = data.c_out
    if c_in:  arch_args['c_in' ]=c_in
    if c_out: arch_args['c_out']=c_out
    return Learner(arch(**arch_args), data, loss_func, opt_func=opt_func, lr=lr, cb_funcs=cbfs, **kwargs)

#collapse_show
learn = cnn_learner(xresnet34, data, loss_func, opt_func, norm=norm_imagenette)

#collapse_show

learn.fit(5, cbsched)
epoch train_loss train_accuracy valid_loss valid_accuracy time
0 1.734203 0.474813 1.960827 0.469299 00:18
1 1.525025 0.574506 2.046430 0.447643 00:18
2 1.396191 0.626888 1.789930 0.493248 00:18
3 1.216749 0.706833 1.156404 0.729427 00:18
4 1.043234 0.776956 1.005148 0.792102 00:18

Imagenet

You can see all this put together in the fastai imagenet training script. It's the same as what we've seen so far, except it also handles multi-GPU training. So how well does this work?

We trained for 60 epochs and got an error of 5.9%, compared to the official PyTorch ResNet, which gets 7.5% error in 90 epochs! Our XResNet-50 training even surpasses a standard ResNet-152, which trains for 50% more epochs and has roughly 3x as many layers.