Fastai Course DL from the Foundations Data Augmentation
Data Augmentation with Pillow and PyTorch (Lesson 4 Part 6)
Fastai Data augmentation
- This post is based on a notebook from the fastai course, Part 2
#collapse
%load_ext autoreload
%autoreload 2
%matplotlib inline
#collapse
from exp.nb_09c import *
We start with PIL transforms to resize all our images to the same size. Then, when they are in a batch, we can apply data augmentation to all of them at the same time on the GPU. We have already seen the basics of resizing and putting images on the GPU in notebook 08, but we'll look into it more deeply now.
#collapse
make_rgb._order=0
#collapse
path = Path("/media/cedric/Datasets/imagenette2-160/")
tfms = [make_rgb, ResizeFixed(128), to_byte_tensor, to_float_tensor]
#collapse
def get_il(tfms): return ImageList.from_files(path, tfms=tfms)
#collapse
il = get_il(tfms)
#collapse
show_image(il[0])
#collapse
img = PIL.Image.open(il.items[0])
#collapse
img
#collapse
img.getpixel((1,1))
#collapse
import numpy as np
#collapse
%timeit -n 10 a = np.array(PIL.Image.open(il.items[0]))
Be careful with resampling methods: you can quickly lose some textures! Sometimes combining methods can be an advantage.
#collapse
img.resize((128,128), resample=PIL.Image.ANTIALIAS)
#collapse
img.resize((128,128), resample=PIL.Image.BILINEAR)
#collapse
img.resize((128,128), resample=PIL.Image.NEAREST)
#collapse_show
img.resize((256,256), resample=PIL.Image.BICUBIC).resize((128,128), resample=PIL.Image.NEAREST)
#collapse_show
img.resize((256,256),resample=PIL.Image.BICUBIC).resize((128,128),resample=PIL.Image.BICUBIC)
#collapse
%timeit img.resize((224,224), resample=PIL.Image.BICUBIC)
#collapse
%timeit img.resize((224,224), resample=PIL.Image.BILINEAR)
#collapse
%timeit -n 10 img.resize((224,224), resample=PIL.Image.NEAREST)
Flipping can be done very fast with PIL.
#collapse
import random
#collapse
random.random()
#collapse_show
def pil_random_flip(x):
    return x.transpose(PIL.Image.FLIP_LEFT_RIGHT) if random.random()<0.5 else x
#collapse_show
il1 = get_il(tfms)
print(il1)
il1.items = [il1.items[0]]*64
dl = DataLoader(il1, 8)
#collapse
x = next(iter(dl))
Here is a convenience function to look at images in a batch.
#collapse_show
def show_image(im, ax=None, figsize=(3,3)):
    if ax is None: _,ax = plt.subplots(1, 1, figsize=figsize)
    ax.axis('off')
    ax.imshow(im.permute(1,2,0))

def show_batch(x, c=4, r=None, figsize=None):
    n = len(x)
    (_,h,w) = x[0].shape
    if r is None: r = int(math.ceil(n/c))
    if figsize is None: figsize=(c*3,r*3)
    fig,axes = plt.subplots(r,c, figsize=figsize)
    for xi,ax in zip(x,axes.flat): show_image(xi, ax)
Without data augmentation:
#collapse_show
show_batch(x)
With random flip:
#collapse_show
il1.tfms.append(pil_random_flip)
#collapse_show
x = next(iter(dl))
show_batch(x)
We can also make that transform a class, so it's easier to set the value of the parameter p. As seen before, it also allows us to set the _order attribute.
#collapse_show
class PilRandomFlip(Transform):
    _order=11
    def __init__(self, p=0.5): self.p=p
    def __call__(self, x):
        return x.transpose(PIL.Image.FLIP_LEFT_RIGHT) if random.random()<self.p else x
#collapse_show
class PilTransform(Transform): _order=11

class PilRandomFlip(PilTransform):
    def __init__(self, p=0.5): self.p=p
    def __call__(self, x):
        return x.transpose(PIL.Image.FLIP_LEFT_RIGHT) if random.random()<self.p else x
#collapse_show
del(il1.tfms[-1])
il1.tfms.append(PilRandomFlip(0.8))
#collapse_show
x = next(iter(dl))
show_batch(x)
PIL can also do the whole dihedral group of transformations (horizontal flip, vertical flip and the 90-degree rotations) with the transpose method. Here are the codes of a few of those transformations:
#collapse_show
PIL.Image.FLIP_LEFT_RIGHT,PIL.Image.ROTATE_270,PIL.Image.TRANSVERSE
Be careful: img.transpose(0) is already one transform, so doing nothing requires a separate case. The codes 0 to 6 then give us 7 different transformations, which together with the identity make up the 8 elements of the dihedral group.
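For reference, here is one way to see the full mapping (these constant names are standard Pillow attributes; evaluating the cell should return the codes 0 through 6 in order):

#collapse_show
#The 7 transpose codes: FLIP_LEFT_RIGHT=0, FLIP_TOP_BOTTOM=1, ROTATE_90=2,
#ROTATE_180=3, ROTATE_270=4, TRANSPOSE=5, TRANSVERSE=6
[PIL.Image.FLIP_LEFT_RIGHT, PIL.Image.FLIP_TOP_BOTTOM, PIL.Image.ROTATE_90,
 PIL.Image.ROTATE_180, PIL.Image.ROTATE_270, PIL.Image.TRANSPOSE, PIL.Image.TRANSVERSE]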
#collapse_show
img = PIL.Image.open(il.items[0])
img = img.resize((128,128), resample=PIL.Image.NEAREST)
_, axs = plt.subplots(2, 4, figsize=(12, 6))
for i,ax in enumerate(axs.flatten()):
    if i==0: ax.imshow(img)
    else:    ax.imshow(img.transpose(i-1))
    ax.axis('off')
And we can implement it like this:
#collapse_show
class PilRandomDihedral(PilTransform):
    #Little hack: scale p by 7/8 so that with p=1 each of the 8 dihedral
    #transforms (including the identity) is equally likely.
    def __init__(self, p=0.75): self.p=p*7/8
    def __call__(self, x):
        if random.random()>self.p: return x
        return x.transpose(random.randint(0,6))
#collapse_show
del(il1.tfms[-1])
il1.tfms.append(PilRandomDihedral())
#collapse_show
show_batch(next(iter(dl)))
#collapse_show
img = PIL.Image.open(il.items[0])
img.size
To crop an image with PIL we have to specify the top/left and bottom/right corners in this format: (left, top, right, bottom). We won't just crop at the size we want: we first crop the section of the image we want, then apply a resize. In what follows, we call the first one the crop_size.
#collapse_show
img.crop((60,60,320,320)).resize((128,128), resample=PIL.Image.BILINEAR)
#collapse
cnr2 = (60,60,320,320)
resample = PIL.Image.BILINEAR
This is pretty fast in PIL:
#collapse
%timeit -n 10 img.crop(cnr2).resize((128,128), resample=resample)
Our time budget: aim for 5 minutes per epoch for ImageNet on 8 GPUs. There are 1.25m images in ImageNet, so per GPU per minute that's 1250000/8/5 == 31250, or ~520 per second. Assuming 4 cores per GPU, we want ~130 images per second per core - so try to stay under 10ms per image. Here we have time to do more things. For instance, we can do the crop and resize in the same call to transform, which will give a smoother result.
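As a quick sanity check on that budget arithmetic (a small throwaway computation, not part of the original notebook):

#collapse
n_images, n_gpus, mins_per_epoch, cores_per_gpu = 1250000, 8, 5, 4
per_gpu_per_min  = n_images / n_gpus / mins_per_epoch  #31250
per_gpu_per_sec  = per_gpu_per_min / 60                #~520
per_core_per_sec = per_gpu_per_sec / cores_per_gpu     #~130
ms_per_image     = 1000 / per_core_per_sec             #~7.7ms, hence the <10ms target
per_gpu_per_min, per_gpu_per_sec, per_core_per_sec, ms_per_image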
#collapse
img.transform((128,128), PIL.Image.EXTENT, cnr2, resample=resample)
#collapse
%timeit -n 10 img.transform((128,128), PIL.Image.EXTENT, cnr2, resample=resample)
It's a little bit slower but still fast enough for our purpose, so we will use this. We then define a general crop transform and two subclasses: one to crop at the center (for validation) and one to randomly crop. Each time, the subclass only implements the way to get the four corners passed to PIL.
#collapse_show
from random import randint

def process_sz(sz):
    sz = listify(sz)
    return tuple(sz if len(sz)==2 else [sz[0],sz[0]])

def default_crop_size(w,h): return [w,w] if w < h else [h,h]

class GeneralCrop(PilTransform):
    def __init__(self, size, crop_size=None, resample=PIL.Image.BILINEAR):
        self.resample,self.size = resample,process_sz(size)
        self.crop_size = None if crop_size is None else process_sz(crop_size)

    def default_crop_size(self, w,h): return default_crop_size(w,h)

    def __call__(self, x):
        csize = self.default_crop_size(*x.size) if self.crop_size is None else self.crop_size
        return x.transform(self.size, PIL.Image.EXTENT, self.get_corners(*x.size, *csize), resample=self.resample)

    def get_corners(self, w, h, wc, hc): return (0,0,w,h)

class CenterCrop(GeneralCrop):
    def __init__(self, size, scale=1.14, resample=PIL.Image.BILINEAR):
        super().__init__(size, resample=resample)
        self.scale = scale

    def default_crop_size(self, w,h): return [w/self.scale,h/self.scale]

    def get_corners(self, w, h, wc, hc):
        return ((w-wc)//2, (h-hc)//2, (w-wc)//2+wc, (h-hc)//2+hc)
#collapse_show
il1.tfms = [make_rgb, CenterCrop(128), to_byte_tensor, to_float_tensor]
#collapse_show
show_batch(next(iter(dl)))
This is the usual data augmentation used on ImageNet (introduced here): it consists of selecting a crop covering 8 to 100% of the image area, with an aspect ratio between 3/4 and 4/3, then resizing that crop to the desired size. It combines some zoom and a bit of squishing at a very low computational cost.
#collapse_show
class RandomResizedCrop(GeneralCrop):
    def __init__(self, size, scale=(0.08,1.0), ratio=(3./4., 4./3.), resample=PIL.Image.BILINEAR):
        super().__init__(size, resample=resample)
        self.scale,self.ratio = scale,ratio

    def get_corners(self, w, h, wc, hc):
        area = w*h
        #Tries 10 times to get a proper crop inside the image.
        for attempt in range(10):
            targ_area = random.uniform(*self.scale) * area
            ratio = math.exp(random.uniform(math.log(self.ratio[0]), math.log(self.ratio[1])))
            new_w = int(round(math.sqrt(targ_area * ratio)))
            new_h = int(round(math.sqrt(targ_area / ratio)))
            if new_w <= w and new_h <= h:
                left = random.randint(0, w - new_w)
                top  = random.randint(0, h - new_h)
                return (left, top, left + new_w, top + new_h)
        #Fallback: center crop with a valid ratio (squishes a bit).
        if   w/h < self.ratio[0]: size = (w, int(w/self.ratio[0]))
        elif w/h > self.ratio[1]: size = (int(h*self.ratio[1]), h)
        else:                     size = (w, h)
        return ((w-size[0])//2, (h-size[1])//2, (w+size[0])//2, (h+size[1])//2)
#collapse_show
il1.tfms = [make_rgb, RandomResizedCrop(128), to_byte_tensor, to_float_tensor]
#collapse_show
show_batch(next(iter(dl)))
To do perspective warping, we map the corners of the image to new points: for instance, if we want to tilt the image so that the top looks closer to us, the top/left corner needs to be shifted to the right and the top/right to the left. To avoid squishing, the bottom/left corner needs to be shifted to the left and the bottom/right corner to the right. For instance, if we have an image with corners in:
(60,60,60,280,280,280,280,60)
(top/left, bottom/left, bottom/right, top/right) then a warped version is:
(90,60,30,280,310,280,250,60)
PIL can do this for us, but it requires 8 coefficients that we need to calculate. The math isn't the most important part here, as we've done it for you: the coefficients are the solution of a linear system of 8 equations, AX = B. The equation solver is called torch.solve in PyTorch.
#collapse_show
from torch import FloatTensor,LongTensor
def find_coeffs(orig_pts, targ_pts):
    matrix = []
    #The equations we'll need to solve.
    for p1, p2 in zip(targ_pts, orig_pts):
        matrix.append([p1[0], p1[1], 1, 0, 0, 0, -p2[0]*p1[0], -p2[0]*p1[1]])
        matrix.append([0, 0, 0, p1[0], p1[1], 1, -p2[1]*p1[0], -p2[1]*p1[1]])
    A = FloatTensor(matrix)
    B = FloatTensor(orig_pts).view(8, 1)
    #The 8 scalars we seek are the solution of AX = B.
    return list(torch.solve(B,A)[0][:,0])
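As a quick hypothetical sanity check (not in the original notebook), the coefficients we get should map each target corner back onto its source corner, which is the direction PIL.Image.PERSPECTIVE expects:

#collapse
#PIL samples the source at ((a*x+b*y+c)/(g*x+h*y+1), (d*x+e*y+f)/(g*x+h*y+1))
#for each output point (x,y); check that a target corner maps to its source corner.
def _apply_persp(coeffs, x, y):
    a,b,c,d,e,f,g,h = [float(o) for o in coeffs]
    den = g*x + h*y + 1
    return ((a*x + b*y + c)/den, (d*x + e*y + f)/den)

_src  = ((90,60),(30,280),(310,280),(250,60))
_targ = ((0,0),(0,128),(128,128),(128,0))
_apply_persp(find_coeffs(_src, _targ), *_targ[0])  #should be close to (90, 60)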
#collapse_show
def warp(img, size, src_coords, resample=PIL.Image.BILINEAR):
    w,h = size
    targ_coords = ((0,0),(0,h),(w,h),(w,0))
    c = find_coeffs(src_coords, targ_coords)
    res = img.transform(size, PIL.Image.PERSPECTIVE, list(c), resample=resample)
    return res
#collapse_show
targ = ((0,0),(0,128),(128,128),(128,0))
src = ((90,60),(30,280),(310,280),(250,60))
#collapse_show
c = find_coeffs(src, targ)
img.transform((128,128), PIL.Image.PERSPECTIVE, list(c), resample=resample)
#collapse
%timeit -n 10 warp(img, (128,128), src)
#collapse
%timeit -n 10 warp(img, (128,128), src, resample=PIL.Image.NEAREST)
#collapse
warp(img, (64,64), src, resample=PIL.Image.BICUBIC)
#collapse
warp(img, (64,64), src, resample=PIL.Image.NEAREST)
#collapse_show
def uniform(a,b): return a + (b-a) * random.random()
We can add a transform that does this perspective warping automatically, combined with the random resize and crop.
#collapse_show
class PilTiltRandomCrop(PilTransform):
    def __init__(self, size, crop_size=None, magnitude=0., resample=PIL.Image.NEAREST):
        self.resample,self.size,self.magnitude = resample,process_sz(size),magnitude
        self.crop_size = None if crop_size is None else process_sz(crop_size)

    def __call__(self, x):
        csize = default_crop_size(*x.size) if self.crop_size is None else self.crop_size
        up_t,lr_t = uniform(-self.magnitude, self.magnitude),uniform(-self.magnitude, self.magnitude)
        left,top = randint(0,x.size[0]-csize[0]),randint(0,x.size[1]-csize[1])
        src_corners = tensor([[-up_t, -lr_t], [up_t, 1+lr_t], [1-up_t, 1-lr_t], [1+up_t, lr_t]])
        src_corners = src_corners * tensor(csize).float() + tensor([left,top]).float()
        src_corners = tuple([(int(o[0].item()), int(o[1].item())) for o in src_corners])
        return warp(x, self.size, src_corners, resample=self.resample)
#collapse_show
il1.tfms = [make_rgb, PilTiltRandomCrop(128, magnitude=0.1), to_byte_tensor, to_float_tensor]
#collapse_show
x = next(iter(dl))
show_batch(x)
The problem is that black padding appears as soon as the source corners fall outside of the image, so we have to limit the magnitude if we want to avoid that.
#collapse_show
class PilTiltRandomCrop(PilTransform):
    def __init__(self, size, crop_size=None, magnitude=0., resample=PIL.Image.BILINEAR):
        self.resample,self.size,self.magnitude = resample,process_sz(size),magnitude
        self.crop_size = None if crop_size is None else process_sz(crop_size)

    def __call__(self, x):
        csize = default_crop_size(*x.size) if self.crop_size is None else self.crop_size
        left,top = randint(0,x.size[0]-csize[0]),randint(0,x.size[1]-csize[1])
        #Limit the magnitude so the source corners stay inside the image.
        top_magn = min(self.magnitude, left/csize[0], (x.size[0]-left)/csize[0]-1)
        lr_magn  = min(self.magnitude, top /csize[1], (x.size[1]-top) /csize[1]-1)
        up_t,lr_t = uniform(-top_magn, top_magn),uniform(-lr_magn, lr_magn)
        src_corners = tensor([[-up_t, -lr_t], [up_t, 1+lr_t], [1-up_t, 1-lr_t], [1+up_t, lr_t]])
        src_corners = src_corners * tensor(csize).float() + tensor([left,top]).float()
        src_corners = tuple([(int(o[0].item()), int(o[1].item())) for o in src_corners])
        return warp(x, self.size, src_corners, resample=self.resample)
#collapse_show
il1.tfms = [make_rgb, PilTiltRandomCrop(128, 200, magnitude=0.2), to_byte_tensor, to_float_tensor]
#collapse_show
[(o._order,o) for o in sorted(tfms, key=operator.attrgetter('_order'))]
#collapse_show
import numpy as np
def np_to_float(x): return torch.from_numpy(np.array(x, dtype=np.float32, copy=False)).permute(2,0,1).contiguous()/255.
np_to_float._order = 30
It is actually faster to combine to_float_tensor and to_byte_tensor in one transform using NumPy.
#collapse
%timeit -n 10 to_float_tensor(to_byte_tensor(img))
#collapse
%timeit -n 10 np_to_float(img)
You can write your own augmentation for your domain's data types, and have them run on the GPU, by using regular PyTorch tensor operations. Here's an example for images. The key is to do them on a whole batch at a time. Nearly all PyTorch operations can be done batch-wise.
Once we have resized our images so that we can batch them together, we can apply more data augmentation on a batch level. For the affine/coord transforms, we proceed like this:
1. generate a grid map, of size bs x height x width x 2, that contains the coordinates of a grid of size height x width (this will be the final size of the image, and doesn't have to be the same as the current size in the batch)
2. apply the affine transforms (which is a matrix multiplication) and the coord transforms to that grid map
3. interpolate the values of the final pixels we want from the initial images in the batch, according to the transformed grid map
For steps 1 and 3 there are PyTorch functions: F.affine_grid and F.grid_sample. F.affine_grid can even combine steps 1 and 2 if we just want to do an affine transformation.
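Before we build those pieces by hand, here is a minimal sketch (not from the original notebook) of the two functions in action with an identity affine transform, using the batch x we grabbed above:

#collapse
#Identity affine transform applied to a whole batch in one go.
theta = tensor([[1., 0., 0.], [0., 1., 0.]]).expand(x.size(0), 2, 3)  #bs x 2 x 3
grid = F.affine_grid(theta, x.size())  #steps 1 and 2: a bs x h x w x 2 grid of coords
out = F.grid_sample(x, grid)           #step 3: interpolate from the input images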
#collapse
il1.tfms = [make_rgb, PilTiltRandomCrop(128, magnitude=0.2), to_byte_tensor, to_float_tensor]
#collapse
dl = DataLoader(il1, 64)
#collapse
x = next(iter(dl))
#collapse
from torch import FloatTensor
#collapse_show
def affine_grid_cpu(size):
    N, C, H, W = size
    grid = FloatTensor(N, H, W, 2)
    linear_points = torch.linspace(-1, 1, W) if W > 1 else tensor([-1])
    grid[:, :, :, 0] = torch.ger(torch.ones(H), linear_points).expand_as(grid[:, :, :, 0])
    linear_points = torch.linspace(-1, 1, H) if H > 1 else tensor([-1])
    grid[:, :, :, 1] = torch.ger(linear_points, torch.ones(W)).expand_as(grid[:, :, :, 1])
    return grid
#collapse_show
grid = affine_grid_cpu(x.size())
#collapse_show
grid.shape
#collapse_show
grid[0,:5,:5]
#collapse
%timeit -n 10 grid = affine_grid_cpu(x.size())
Coords in the grid go from -1 to 1 (that's the PyTorch convention).
The PyTorch version is slower on the CPU but optimized to go very fast on the GPU:
#collapse_show
m = tensor([[1., 0., 0.], [0., 1., 0.]])
theta = m.expand(x.size(0), 2, 3)
#collapse_show
theta.shape
#collapse_show
%timeit -n 10 grid = F.affine_grid(theta, x.size())
#collapse_show
%timeit -n 10 grid = F.affine_grid(theta.cuda(), x.size())
So we write our own version that dispatches to our function on the CPU and uses PyTorch's on the GPU.
#collapse_show
def affine_grid(x, size):
    size = (size,size) if isinstance(size, int) else tuple(size)
    size = (x.size(0),x.size(1)) + size
    if x.device.type == 'cpu': return affine_grid_cpu(size)
    m = tensor([[1., 0., 0.], [0., 1., 0.]], device=x.device)
    return F.affine_grid(m.expand(x.size(0), 2, 3), size)
#collapse_show
grid = affine_grid(x, 128)
In 2D an affine transformation has the form y = Ax + b where A is a 2x2 matrix and b a vector with 2 coordinates. It's usually represented by the 3x3 matrix
A[0,0] A[0,1] b[0]
A[1,0] A[1,1] b[1]
0 0 1
because then the composition of two affine transforms can be computed with the matrix product of their 3x3 representations.
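As a small hypothetical example of that composition rule, "rotate by 90 degrees, then translate by (1, 2)" is just the product of the two 3x3 representations:

#collapse
#Composing two affine maps via their 3x3 representations (translation in the last column).
rot = tensor([[0., -1., 0.], [1., 0., 0.], [0., 0., 1.]])   #90 degree rotation
trans = tensor([[1., 0., 1.], [0., 1., 2.], [0., 0., 1.]])  #translation by (1,2)
comp = trans @ rot            #first rotate, then translate
pt = tensor([1., 0., 1.])     #the point (1,0) in homogeneous coordinates
comp @ pt                     #-> tensor([1., 3., 1.]): (1,0) rotates to (0,1), then shifts to (1,3)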
#collapse_show
from torch import stack,zeros_like,ones_like
The matrix for a rotation by an angle theta is:
cos(theta) -sin(theta) 0
sin(theta) cos(theta) 0
0 0 1
Here we have to apply the inverse of a regular rotation (exercise: find out why!), so we use this matrix:
cos(theta) sin(theta) 0
-sin(theta) cos(theta) 0
0 0 1
Then we draw a different theta for each image in the batch, to return a batch of rotation matrices (size bs x 3 x 3).
#collapse_show
def rotation_matrix(thetas):
    thetas.mul_(math.pi/180)  #converts degrees to radians, in-place
    rows = [stack([thetas.cos(),             thetas.sin(),             torch.zeros_like(thetas)], dim=1),
            stack([-thetas.sin(),            thetas.cos(),             torch.zeros_like(thetas)], dim=1),
            stack([torch.zeros_like(thetas), torch.zeros_like(thetas), torch.ones_like(thetas) ], dim=1)]
    return stack(rows, dim=1)
#collapse
thetas = torch.empty(x.size(0)).uniform_(-30,30)
#collapse
thetas[:5]
#collapse
m = rotation_matrix(thetas)
#collapse
m.shape, m[:,None].shape, grid.shape
#collapse
grid.view(64,-1,2).shape
We have to apply our rotation to every point in the grid. The matrix a is given by the first two rows and columns of m, and the vector b by the first two coefficients of the last row (since we multiply the grid on the right by a, the translation lives in the last row rather than the last column). Of course, we have to deal with the fact that m here is a batch of matrices.
#collapse
a = m[:,:2,:2]
b = m[:, 2:,:2]
tfm_grid = (grid.view(64,-1,2) @ a + b).view(64, 128, 128, 2)
We can also do this without the view by using broadcasting.
#collapse
%timeit -n 10 tfm_grid = grid @ m[:,None,:2,:2] + m[:,2,:2][:,None,None]
#collapse
%timeit -n 10 tfm_grid = torch.einsum('bijk,bkl->bijl', grid, m[:,:2,:2]) + m[:,2,:2][:,None,None]
#collapse
%timeit -n 10 tfm_grid = torch.matmul(grid, m[:,:2,:2].unsqueeze(1)) + m[:,2,:2][:,None,None]
#collapse
%timeit -n 10 tfm_grid = (torch.bmm(grid.view(64,-1,2), m[:,:2,:2]) + m[:,2,:2][:,None]).view(-1, 128, 128, 2)
And on the GPU:
#collapse
grid = grid.cuda()
m = m.cuda()
#collapse
%timeit -n 10 tfm_grid = grid @ m[:,None,:2,:2] + m[:,2,:2][:,None,None]
#collapse
%timeit -n 10 tfm_grid = torch.einsum('bijk,bkl->bijl', grid, m[:,:2,:2]) + m[:,2,:2][:,None,None]
#collapse
%timeit -n 10 tfm_grid = torch.matmul(grid, m[:,:2,:2].unsqueeze(1)) + m[:,2,:2][:,None,None]
#collapse
%timeit -n 10 tfm_grid = (torch.bmm(grid.view(64,-1,2), m[:,:2,:2]) + m[:,2,:2][:,None]).view(-1, 128, 128, 2)
Since bmm is always the fastest, we use that one for the matrix multiplication.
#collapse_show
tfm_grid = torch.bmm(grid.view(64,-1,2), m[:,:2,:2]).view(-1, 128, 128, 2)  #b is zero for a pure rotation, so we can skip the addition
The interpolation to find our coordinates back is done by grid_sample.
#collapse_show
tfm_x = F.grid_sample(x, tfm_grid.cpu())
#collapse_show
show_batch(tfm_x, r=2)
It takes a padding_mode argument ('zeros' by default; the other options are 'border' and 'reflection').
#collapse_show
tfm_x = F.grid_sample(x, tfm_grid.cpu(), padding_mode='reflection')
#collapse_show
show_batch(tfm_x, r=2)
Let's look at the speed now!
#collapse_show
def rotate_batch(x, size, degrees):
    grid = affine_grid(x, size)
    thetas = x.new(x.size(0)).uniform_(-degrees,degrees)
    m = rotation_matrix(thetas)
    tfm_grid = grid @ m[:,:2,:2].unsqueeze(1) + m[:,2,:2][:,None,None]
    return F.grid_sample(x, tfm_grid)
#collapse
show_batch(rotate_batch(x, 128, 30), r=2)
#collapse
%timeit -n 10 tfm_x = rotate_batch(x, 128, 30)
#collapse
%timeit -n 10 tfm_x = rotate_batch(x.cuda(), 128, 30)
Not bad for 64 rotations!
But we can be even faster!
#collapse
from torch import Tensor
#collapse_show
from torch.jit import script

@script
def rotate_batch(x:Tensor, size:int, degrees:float) -> Tensor:
    sz = (x.size(0),x.size(1)) + (size,size)
    idm = torch.zeros(2,3, device=x.device)
    idm[0,0] = 1.
    idm[1,1] = 1.
    grid = F.affine_grid(idm.expand(x.size(0), 2, 3), sz)
    thetas = torch.zeros(x.size(0), device=x.device).uniform_(-degrees,degrees)
    m = rotation_matrix(thetas)
    tfm_grid = torch.matmul(grid, m[:,:2,:2].unsqueeze(1)) + m[:,2,:2].unsqueeze(1).unsqueeze(2)
    return F.grid_sample(x, tfm_grid)
#collapse
m = tensor([[1., 0., 0.], [0., 1., 0.]], device=x.device)
#collapse
%timeit -n 10 tfm_x = rotate_batch(x.cuda(), 128, 30)
The speed of this depends a lot on what card you have. On a V100 it is generally about 3x faster than the non-JIT version (as of April 2019), although PyTorch JIT is rapidly improving.
And it's even faster if we give the rotation matrix directly to affine_grid.
#collapse_show
def rotate_batch(x, size, degrees):
    size = (size,size) if isinstance(size, int) else tuple(size)
    size = (x.size(0),x.size(1)) + size
    thetas = x.new(x.size(0)).uniform_(-degrees,degrees)
    m = rotation_matrix(thetas)
    grid = F.affine_grid(m[:,:2], size)
    return F.grid_sample(x.cuda(), grid)
#collapse
%timeit -n 10 tfm_x = rotate_batch(x.cuda(), 128, 30)