Fastai Course: Deep Learning From the Foundations, CUDA and Hooks
Using CUDA and Hooks (Lesson 3, Part 3)
#collapse
%load_ext autoreload
%autoreload 2
%matplotlib inline
#collapse
from exp.nb_05b import *
torch.set_num_threads(2)
#collapse
x_train,y_train,x_valid,y_valid = get_data()
Helper function to quickly normalize with the mean and standard deviation from our training set:
#collapse_show
def normalize_to(train, valid):
m,s = train.mean(),train.std()
return normalize(train, m, s), normalize(valid, m, s)
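For reference, normalize comes from an earlier notebook and simply subtracts the mean and divides by the standard deviation; here is a minimal sketch of the assumed definition (it's already available via the import above):
#collapse_show
# Assumed definition from the earlier notebooks (already imported above)
def normalize(x, m, s): return (x-m)/s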
#collapse_show
x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
Let's check it behaved properly.
#collapse_show
x_train.mean(),x_train.std()
#collapse_show
nh,bs = 50,512
c = y_train.max().item()+1
loss_func = F.cross_entropy
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)
To refactor layers, it's useful to have a Lambda layer that can take a basic function and convert it into a layer you can put in nn.Sequential.
NB: if you use a Lambda layer with a lambda function, your model won't pickle, so you won't be able to save it with PyTorch. It's best to give a name to the function you use inside your Lambda (like flatten below).
#collapse_show
class Lambda(nn.Module):
def __init__(self, func):
super().__init__()
self.func = func
def forward(self, x): return self.func(x)
def flatten(x): return x.view(x.shape[0], -1)
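To see the pickling caveat in practice (a hypothetical cell, not in the original notebook): a Lambda wrapping a named function serializes fine, while one wrapping an anonymous lambda raises a pickling error.
#collapse_show
import pickle
# Wrapping a named function pickles without trouble
_ = pickle.dumps(Lambda(flatten))
# Wrapping an anonymous lambda fails; uncomment to see the error
# pickle.dumps(Lambda(lambda x: x.view(x.shape[0], -1)))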
The next function takes the flat vector of size bs x 784 and puts it back as a batch of images of 28 by 28 pixels:
#collapse_show
def mnist_resize(x): return x.view(-1, 1, 28, 28)
We can now define a simple CNN. Thanks to the Lambda class we can use mnist_resize directly in our model, as well as flatten before the fully connected layer.
#collapse_show
def get_cnn_model(data):
return nn.Sequential(
Lambda(mnist_resize),
nn.Conv2d( 1, 8, 5, padding=2,stride=2), nn.ReLU(), #14
nn.Conv2d( 8,16, 3, padding=1,stride=2), nn.ReLU(), # 7
nn.Conv2d(16,32, 3, padding=1,stride=2), nn.ReLU(), # 4
nn.Conv2d(32,32, 3, padding=1,stride=2), nn.ReLU(), # 2
nn.AdaptiveAvgPool2d(1),
Lambda(flatten),
nn.Linear(32,data.c)
)
#collapse_show
model = get_cnn_model(data)
Basic callbacks from the previous notebook:
#collapse_show
cbfs = [Recorder, partial(AvgStatsCallback,accuracy)]
#collapse_show
opt = optim.SGD(model.parameters(), lr=0.4)
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)
#collapse_show
%time run.fit(1, learn)
This took a long time to run, so it's time to use a GPU. A simple Callback can make sure the model, inputs and targets are all on the same device.
#collapse_show
# Somewhat more flexible way
device = torch.device('cuda',0)
#collapse_show
class CudaCallback(Callback):
def __init__(self,device): self.device=device
def begin_fit(self): self.model.to(self.device)
def begin_batch(self): self.run.xb,self.run.yb = self.xb.to(self.device),self.yb.to(self.device)
We can also set the device as our default:
#collapse_show
torch.cuda.set_device(device)
#collapse_show
class CudaCallback(Callback):
def begin_fit(self): self.model.cuda()
def begin_batch(self): self.run.xb,self.run.yb = self.xb.cuda(),self.yb.cuda()
#collapse
cbfs.append(CudaCallback)
#collapse
model = get_cnn_model(data)
#collapse_show
opt = optim.SGD(model.parameters(), lr=0.4)
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)
#collapse_show
%time run.fit(3, learn)
Now, that's definitely faster!
First we can regroup each conv/ReLU pair into a single function. We also set default values for the stride and filter size.
#collapse_show
def conv2d(ni, nf, ks=3, stride=2):
return nn.Sequential(
nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride), nn.ReLU())
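As a quick sanity check (a hypothetical cell, not in the original notebook), a stride-2 convolution with padding ks//2 halves the spatial size, which is where the #14, #7, ... comments in the earlier model come from:
#collapse_show
# A 28x28 single-channel input comes out as 8 channels of 14x14
conv2d(1, 8, 5)(torch.randn(2, 1, 28, 28)).shape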
We can also do the MNIST resize as a batch transform, implemented with a Callback. The callback takes a transformation function and stores it away; in this case the transform resizes our input, and we set it as the default with partial.
#collapse_show
class BatchTransformXCallback(Callback):
_order=2
def __init__(self, tfm): self.tfm = tfm
def begin_batch(self): self.run.xb = self.tfm(self.xb)
def view_tfm(*size):
def _inner(x): return x.view(*((-1,)+size))
return _inner
#collapse_show
mnist_view = view_tfm(1,28,28)
cbfs.append(partial(BatchTransformXCallback, mnist_view))
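As a quick check (a hypothetical cell, not in the original notebook), mnist_view turns a batch of flat 784-vectors back into 1x28x28 images:
#collapse_show
# A flat batch of four 784-vectors becomes four 1x28x28 images
mnist_view(x_train[:4]).shape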
With the AdaptiveAvgPool, this model can now work on any size input:
#collapse_show
nfs = [8,16,32,32]
We can pass this list of filter numbers to the functions below and configure the network easily that way.
#collapse_show
def get_cnn_layers(data, nfs):
nfs = [1] + nfs
return [
conv2d(nfs[i], nfs[i+1], 5 if i==0 else 3)
for i in range(len(nfs)-1)
] + [nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c)]
def get_cnn_model(data, nfs): return nn.Sequential(*get_cnn_layers(data, nfs))
And this helper function will quickly give us everything needed to run the training. The kernel size is 5 in the first layer and 3 otherwise: the first layer only has one input channel, so with a 3x3 kernel each output would be computed from just 9 values, barely more than the 8 filters we produce; a larger filter gives that layer enough inputs to do useful computation and extract useful features.
#collapse_show
def get_runner(model, data, lr=0.6, cbs=None, opt_func=None, loss_func = F.cross_entropy):
if opt_func is None: opt_func = optim.SGD
opt = opt_func(model.parameters(), lr=lr)
learn = Learner(model, opt, loss_func, data)
return learn, Runner(cb_funcs=listify(cbs))
#collapse
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, data, lr=0.4, cbs=cbfs)
#collapse_show
model
#collapse
run.fit(3, learn)
Let's say we want to do some telemetry, and want the mean and standard deviation of the activations of each layer in the model. First we can do it manually like this:
#collapse_show
class SequentialModel(nn.Module):
def __init__(self, *layers):
super().__init__()
self.layers = nn.ModuleList(layers)
self.act_means = [[] for _ in layers]
self.act_stds = [[] for _ in layers]
def __call__(self, x):
for i,l in enumerate(self.layers):
x = l(x)
self.act_means[i].append(x.data.mean())
self.act_stds [i].append(x.data.std ())
return x
def __iter__(self): return iter(self.layers)
#collapse
model = SequentialModel(*get_cnn_layers(data, nfs))
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
#collapse
run.fit(2, learn)
Now we can have a look at the means and standard deviations of the activations at the beginning of training. These look awful right now since we haven't used any initialization technique so far.
Means
#collapse_show
for l in model.act_means: plt.plot(l)
plt.legend(range(6));
Std. Deviation
#collapse_show
for l in model.act_stds: plt.plot(l)
plt.legend(range(6));
First 10 means (of the first 10 batches)
#collapse_show
for l in model.act_means: plt.plot(l[:10])
plt.legend(range(6));
First 10 std. deviations (of the first 10 batches)
#collapse_show
for l in model.act_stds: plt.plot(l[:10])
plt.legend(range(6));
Hooks are PyTorch objects you can add to any nn.Module. A hook is called when the layer it is registered to is executed during the forward pass (a forward hook) or the backward pass (a backward hook).
Hooks don't require us to rewrite the model; they are essentially PyTorch's version of callbacks.
#collapse
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, data, lr=0.5, cbs=cbfs)
#collapse_show
act_means = [[] for _ in model]
act_stds = [[] for _ in model]
A hook is attached to a layer and needs a function that takes three arguments: module, input, output. Here we store the mean and std of the output in the correct position of our list.
#collapse_show
def append_stats(i, mod, inp, outp):
act_means[i].append(outp.data.mean())
act_stds [i].append(outp.data.std())
#collapse_show
for i,m in enumerate(model): m.register_forward_hook(partial(append_stats, i))
#collapse
run.fit(1, learn)
#collapse_show
for o in act_means: plt.plot(o)
plt.legend(range(5));
We can refactor this into a Hook class. It's very important to remove the hooks when we're done with them, otherwise references are kept around and the memory won't be properly released when your model is deleted.
#collapse_show
def children(m): return list(m.children())
class Hook():
def __init__(self, m, f): self.hook = m.register_forward_hook(partial(f, self))
def remove(self): self.hook.remove()
def __del__(self): self.remove()
def append_stats(hook, mod, inp, outp):
if not hasattr(hook,'stats'): hook.stats = ([],[])
means,stds = hook.stats
means.append(outp.data.mean())
stds .append(outp.data.std())
NB: In fastai we use a bool param to choose whether to make it a forward or backward hook. In the above version we're only supporting forward hooks.
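The same pattern works for any per-layer statistic. For instance, here is a hypothetical hook function (not in the original notebook) that just counts how many times its layer was run:
#collapse_show
# Hypothetical example: partial(f, self) passes the Hook object as the first
# argument, so the function can stash whatever state it likes on it
def count_calls(hook, mod, inp, outp):
    if not hasattr(hook, 'n_calls'): hook.n_calls = 0
    hook.n_calls += 1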
#collapse
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, data, lr=0.5, cbs=cbfs)
#collapse_show
hooks = [Hook(l, append_stats) for l in children(model[:4])]
#collapse
run.fit(1, learn)
#collapse_show
for h in hooks:
plt.plot(h.stats[0])
h.remove()
plt.legend(range(4));
Let's design our own class that can contain a list of objects. It will behave a bit like a numpy array in the sense that we can index into it via:
- a single index
- a slice (like 1:5)
- a list of indices
- a mask of indices ([True,False,False,True,...])
The __iter__ method is there to be able to do things like for x in ...
#collapse_show
class ListContainer():
def __init__(self, items): self.items = listify(items)
def __getitem__(self, idx):
if isinstance(idx, (int,slice)): return self.items[idx]
if isinstance(idx[0],bool):
assert len(idx)==len(self) # bool mask
return [o for m,o in zip(idx,self.items) if m]
return [self.items[i] for i in idx]
def __len__(self): return len(self.items)
def __iter__(self): return iter(self.items)
def __setitem__(self, i, o): self.items[i] = o
def __delitem__(self, i): del(self.items[i])
def __repr__(self):
res = f'{self.__class__.__name__} ({len(self)} items)\n{self.items[:10]}'
if len(self)>10: res = res[:-1]+ '...]'
return res
#collapse_show
ListContainer(range(10))
#collapse
ListContainer(range(100))
#collapse_show
t = ListContainer(range(10))
t[[1,2]], t[[False]*8 + [True,False]]
We can use it to write a Hooks class that contains several hooks. We will also use it in the next notebook as a container for our objects in the data block API. We add a __del__ method so the hooks are removed when the object isn't used anymore, to free up memory.
#collapse_show
from torch.nn import init
class Hooks(ListContainer):
def __init__(self, ms, f): super().__init__([Hook(m, f) for m in ms])
def __enter__(self, *args): return self
def __exit__ (self, *args): self.remove()
def __del__(self): self.remove()
def __delitem__(self, i):
self[i].remove()
super().__delitem__(i)
def remove(self):
for h in self: h.remove()
#collapse
model = get_cnn_model(data, nfs).cuda()
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
#collapse
hooks = Hooks(model, append_stats)
hooks
#collapse
hooks.remove()
#collapse
x,y = next(iter(data.train_dl))
x = mnist_resize(x).cuda()
#collapse
x.mean(),x.std()
#collapse
p = model[0](x)
p.mean(),p.std()
#collapse
for l in model:
if isinstance(l, nn.Sequential):
init.kaiming_normal_(l[0].weight)
l[0].bias.data.zero_()
#collapse
p = model[0](x)
p.mean(),p.std()
Having given an __enter__ and __exit__ method to our Hooks class, we can use it as a context manager. This makes sure that once we are out of the with block, all the hooks have been removed and aren't there to pollute our memory.
#collapse_show
with Hooks(model, append_stats) as hooks:
run.fit(2, learn)
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
ms,ss = h.stats
ax0.plot(ms[:10])
ax1.plot(ss[:10])
plt.legend(range(6));
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
ms,ss = h.stats
ax0.plot(ms)
ax1.plot(ss)
plt.legend(range(6));
Let's store more than the means and stds and plot histograms of our activations now.
#collapse_show
def append_stats(hook, mod, inp, outp):
if not hasattr(hook,'stats'): hook.stats = ([],[],[])
means,stds,hists = hook.stats
means.append(outp.data.mean().cpu())
stds .append(outp.data.std().cpu())
hists.append(outp.data.cpu().histc(40,0,10)) #histc isn't implemented on the GPU
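For reference (a hypothetical cell, not in the original notebook), histc(40, 0, 10) counts how many values fall into each of 40 equally spaced bins between 0 and 10, so every forward pass adds one 40-bin histogram per hooked layer:
#collapse_show
# 40 bin counts over the range [0, 10]; values outside that range are ignored
torch.randn(1000).abs().histc(40, 0, 10).shape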
#collapse
model = get_cnn_model(data, nfs).cuda()
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
#collapse_show
for l in model:
if isinstance(l, nn.Sequential):
init.kaiming_normal_(l[0].weight)
l[0].bias.data.zero_()
#collapse_show
with Hooks(model, append_stats) as hooks: run.fit(1, learn)
#collapse_show
# Thanks to @ste for the initial version of the histogram plotting code
def get_hist(h): return torch.stack(h.stats[2]).t().float().log1p()
#collapse_show
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
ax.imshow(get_hist(h), origin='lower')
ax.axis('off')
plt.tight_layout()
From the histograms, we can easily get more information, such as what fraction of the activations are close to zero (the bottom bins of the histogram):
#collapse_show
def get_min(h):
h1 = torch.stack(h.stats[2]).t().float()
return h1[:2].sum(0)/h1.sum(0)
#collapse_show
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
ax.plot(get_min(h))
ax.set_ylim(0,1)
plt.tight_layout()
Now let's use our model with a generalized ReLU that can be leaky, shifted down by a constant, and clamped to a maximum value.
#collapse_show
def get_cnn_layers(data, nfs, layer, **kwargs):
nfs = [1] + nfs
return [layer(nfs[i], nfs[i+1], 5 if i==0 else 3, **kwargs)
for i in range(len(nfs)-1)] + [
nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c)]
def conv_layer(ni, nf, ks=3, stride=2, **kwargs):
return nn.Sequential(
nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride), GeneralRelu(**kwargs))
class GeneralRelu(nn.Module):
def __init__(self, leak=None, sub=None, maxv=None):
super().__init__()
self.leak,self.sub,self.maxv = leak,sub,maxv
def forward(self, x):
x = F.leaky_relu(x,self.leak) if self.leak is not None else F.relu(x)
if self.sub is not None: x.sub_(self.sub)
if self.maxv is not None: x.clamp_max_(self.maxv)
return x
def init_cnn(m, uniform=False):
f = init.kaiming_uniform_ if uniform else init.kaiming_normal_
for l in m:
if isinstance(l, nn.Sequential):
f(l[0].weight, a=0.1)
l[0].bias.data.zero_()
def get_cnn_model(data, nfs, layer, **kwargs):
return nn.Sequential(*get_cnn_layers(data, nfs, layer, **kwargs))
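As a quick illustration (a hypothetical cell, not in the original notebook), GeneralRelu is leaky below zero, shifted down by sub, and clamped at maxv:
#collapse_show
# Negative values are scaled by `leak`, everything is shifted down by `sub`,
# and anything above `maxv` is clamped
GeneralRelu(leak=0.1, sub=0.4, maxv=6.)(torch.tensor([-2., 0., 1., 10.]))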
#collapse_show
def append_stats(hook, mod, inp, outp):
if not hasattr(hook,'stats'): hook.stats = ([],[],[])
means,stds,hists = hook.stats
means.append(outp.data.mean().cpu())
stds .append(outp.data.std().cpu())
hists.append(outp.data.cpu().histc(40,-7,7))
#collapse_show
model = get_cnn_model(data, nfs, conv_layer, leak=0.1, sub=0.4, maxv=6.)
init_cnn(model)
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
#collapse_show
with Hooks(model, append_stats) as hooks:
run.fit(1, learn)
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
ms,ss,hi = h.stats
ax0.plot(ms[:10])
ax1.plot(ss[:10])
h.remove()
plt.legend(range(5));
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks:
ms,ss,hi = h.stats
ax0.plot(ms)
ax1.plot(ss)
plt.legend(range(5));
#collapse_show
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
ax.imshow(get_hist(h), origin='lower')
ax.axis('off')
plt.tight_layout()
#collapse_show
def get_min(h):
h1 = torch.stack(h.stats[2]).t().float()
return h1[19:22].sum(0)/h1.sum(0)
#collapse_show
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
ax.plot(get_min(h))
ax.set_ylim(0,1)
plt.tight_layout()
#collapse_show
def get_learn_run(nfs, data, lr, layer, cbs=None, opt_func=None, uniform=False, **kwargs):
model = get_cnn_model(data, nfs, layer, **kwargs)
init_cnn(model, uniform=uniform)
return get_runner(model, data, lr=lr, cbs=cbs, opt_func=opt_func)
#collapse_show
sched = combine_scheds([0.5, 0.5], [sched_cos(0.2, 1.), sched_cos(1., 0.1)])
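To visualize the resulting schedule (a hypothetical cell, assuming sched maps a training progress between 0 and 1 to a learning rate, as in the annealing notebook): a cosine warm-up from 0.2 to 1.0 over the first half of training, then a cosine decay down to 0.1.
#collapse_show
# Plot the learning rate as a function of training progress
xs = [o/100 for o in range(100)]
plt.plot(xs, [sched(x) for x in xs]);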
#collapse_show
learn,run = get_learn_run(nfs, data, 1., conv_layer, cbs=cbfs+[partial(ParamScheduler,'lr', sched)])
#collapse
run.fit(8, learn)
Uniform init may provide more useful initial weights (a normal distribution puts a lot of them close to 0).
#collapse
learn,run = get_learn_run(nfs, data, 1., conv_layer, uniform=True,
cbs=cbfs+[partial(ParamScheduler,'lr', sched)])
#collapse
run.fit(8, learn)