Add Steve's VI with NFs curriculum

avital · avital · commit c48f3e92f96c · 2020-05-25T12:01:11.000+02:00
diff --git a/_config.yml b/_config.yml
@@ -51,6 +51,10 @@ riley:
     name: Riley Edmunds
     email: rileyedmunds@gmail.com
     web: https://www.linkedin.com/in/rileyedmunds
+kroon:
+    name: Steve Kroon
+    email: skroon@gmail.com
+    web: https://twitter.com/skroon
 
 # Build settings
 markdown:       kramdown
diff --git a/_posts/2020-03-02-SVGD.markdown b/_posts/2020-03-02-SVGD.markdown
@@ -8,7 +8,7 @@ blurb: "Stein Variational Gradient Descent is a powerful, non-parameteric Bayesi
 feedback: true
 ---
 
-[Editor’s Note: This class was a part of the 2019 DFL Jane Street Fellowship.]
+[Editor’s Note: This class was a part of the 2019 DFL [Jane Street](https://www.janestreet.com/) Fellowship.]
 
 This guide is thanks to a many different people, all of whom took their time to give feedback, write reviews, and provide their own insights to the curriculum.
 
diff --git a/_posts/2020-04-07-Resurrecting-Sigmoid.markdown b/_posts/2020-04-07-Resurrecting-Sigmoid.markdown
@@ -10,7 +10,7 @@ networks with gradient-based methods. This paper studies, from a rigorous theore
 feedback: true
 ---
 
-[Editor’s Note: This class was a part of the 2019 DFL Jane Street Fellowship.]
+[Editor’s Note: This class was a part of the 2019 DFL [Jane Street](https://www.janestreet.com/) Fellowship.]
  
 This guide would not have been possible without the help and feedback from many people. 
 
diff --git a/assets/VI-with-NFs.svg b/assets/VI-with-NFs.svg
@@ -0,0 +1,3 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" style="background-color: rgb(255, 255, 255);" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="338px" height="554px" viewBox="-0.5 -0.5 338 554"><defs/><g><a xlink:href="#4-inference-networks-and-amortized-vi"><rect x="142" y="142" width="120" height="60" rx="9" ry="9" fill-opacity="0.5" fill="#dae8fc" stroke="#6c8ebf" stroke-opacity="0.5" stroke-width="4" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 172px; margin-left: 143px;"><div style="box-sizing: border-box; font-size: 0; text-align: center; "><div style="display: inline-block; font-size: 16px; font-family: Helvetica; color: #000000; line-height: 1.2; pointer-events: all; white-space: normal; word-wrap: normal; "><font style="font-size: 16px">Amortized VI</font></div></div></div></foreignObject><text x="202" y="177" fill="#000000" font-family="Helvetica" font-size="16px" text-anchor="middle">Amortized VI</text></switch></g></a><a xlink:href="#1-bayesian-inference-and-latent-variable-models"><rect x="142" y="492" width="120" height="60" rx="9" ry="9" fill-opacity="0.5" fill="#dae8fc" stroke="#6c8ebf" stroke-opacity="0.5" stroke-width="4" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 522px; margin-left: 143px;"><div style="box-sizing: border-box; 
font-size: 0; text-align: center; "><div style="display: inline-block; font-size: 16px; font-family: Helvetica; color: #000000; line-height: 1.2; pointer-events: all; white-space: normal; word-wrap: normal; ">Bayesian Inference</div></div></div></foreignObject><text x="202" y="527" fill="#000000" font-family="Helvetica" font-size="16px" text-anchor="middle">Bayesian Infere...</text></switch></g></a><a xlink:href="#5-normalizing-flows"><rect x="2" y="142" width="120" height="60" rx="9" ry="9" fill-opacity="0.5" fill="#dae8fc" stroke="#6c8ebf" stroke-opacity="0.5" stroke-width="4" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 172px; margin-left: 3px;"><div style="box-sizing: border-box; font-size: 0; text-align: center; "><div style="display: inline-block; font-size: 16px; font-family: Helvetica; color: #000000; line-height: 1.2; pointer-events: all; white-space: normal; word-wrap: normal; "><font style="font-size: 16px">Normalizing Flows</font></div></div></div></foreignObject><text x="62" y="177" fill="#000000" font-family="Helvetica" font-size="16px" text-anchor="middle">Normalizing Flo...</text></switch></g></a><path d="M 62 142 L 62 112 Q 62 102 72 102 L 162 102 Q 172 102 172 92 L 172 70.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 172 64.24 L 176 72.24 L 172 70.24 L 168 72.24 Z" fill="#000000" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 202 142 L 202 70.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" 
pointer-events="stroke"/><path d="M 202 64.24 L 206 72.24 L 202 70.24 L 198 72.24 Z" fill="#000000" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><a xlink:href="#2-introduction-to-variational-inference-vi"><rect x="142" y="392" width="120" height="60" rx="9" ry="9" fill-opacity="0.5" fill="#dae8fc" stroke="#6c8ebf" stroke-opacity="0.5" stroke-width="4" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 422px; margin-left: 143px;"><div style="box-sizing: border-box; font-size: 0; text-align: center; "><div style="display: inline-block; font-size: 16px; font-family: Helvetica; color: #000000; line-height: 1.2; pointer-events: all; white-space: normal; word-wrap: normal; "><font style="font-size: 16px">Variational Inference</font></div></div></div></foreignObject><text x="202" y="427" fill="#000000" font-family="Helvetica" font-size="16px" text-anchor="middle">Variational Inf...</text></switch></g></a><a xlink:href="#3-doubly-stochastic-estimation-vi-by-monte-carlo-mini-batch-gradient-estimation"><rect x="216" y="262" width="120" height="60" rx="9" ry="9" fill-opacity="0.5" fill="#dae8fc" stroke="#6c8ebf" stroke-opacity="0.5" stroke-width="4" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 292px; margin-left: 217px;"><div 
style="box-sizing: border-box; font-size: 0; text-align: center; "><div style="display: inline-block; font-size: 16px; font-family: Helvetica; color: #000000; line-height: 1.2; pointer-events: all; white-space: normal; word-wrap: normal; ">Stochastic VI</div></div></div></foreignObject><text x="276" y="297" fill="#000000" font-family="Helvetica" font-size="16px" text-anchor="middle">Stochastic VI</text></switch></g></a><path d="M 132 262 L 132 242 Q 132 232 142 232 L 162 232 Q 172 232 172 222 L 172 210.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 172 204.24 L 176 212.24 L 172 210.24 L 168 212.24 Z" fill="#000000" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 276 262 L 276 242 Q 276 232 266 232 L 242 232 Q 232 232 232 222 L 232 210.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 232 204.24 L 236 212.24 L 232 210.24 L 228 212.24 Z" fill="#000000" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 202 392 L 202 372 Q 202 362 192 362 L 142 362 Q 132 362 132 352 L 132 330.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 132 324.24 L 136 332.24 L 132 330.24 L 128 332.24 Z" fill="#000000" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><a xlink:href="#6-normalizing-flows-for-variational-inference"><rect x="142" y="2" width="120" height="60" rx="9" ry="9" fill-opacity="0.5" fill="#dae8fc" stroke="#6c8ebf" stroke-opacity="0.5" stroke-width="4" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div 
xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 32px; margin-left: 143px;"><div style="box-sizing: border-box; font-size: 0; text-align: center; "><div style="display: inline-block; font-size: 16px; font-family: Helvetica; color: #000000; line-height: 1.2; pointer-events: all; white-space: normal; word-wrap: normal; "><font style="font-size: 16px">NFs for VI</font></div></div></div></foreignObject><text x="202" y="37" fill="#000000" font-family="Helvetica" font-size="16px" text-anchor="middle">NFs for VI</text></switch></g></a><path d="M 212 392 L 212 372 Q 212 362 222 362 L 266 362 Q 276 362 276 352 L 276 330.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 276 324.24 L 280 332.24 L 276 330.24 L 272 332.24 Z" fill="#000000" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><a xlink:href="#3-doubly-stochastic-estimation-vi-by-monte-carlo-mini-batch-gradient-estimation"><rect x="52" y="262" width="120" height="60" rx="9" ry="9" fill-opacity="0.5" fill="#dae8fc" stroke="#6c8ebf" stroke-opacity="0.5" stroke-width="4" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 292px; margin-left: 53px;"><div style="box-sizing: border-box; font-size: 0; text-align: center; "><div style="display: inline-block; font-size: 16px; font-family: Helvetica; color: #000000; line-height: 1.2; pointer-events: all; white-space: normal; word-wrap: normal; ">Black Box VI</div></div></div></foreignObject><text x="112" y="297" 
fill="#000000" font-family="Helvetica" font-size="16px" text-anchor="middle">Black Box VI</text></switch></g></a><path d="M 202 492 L 202 460.24" fill="none" stroke="#000000" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 202 454.24 L 206 462.24 L 202 460.24 L 198 462.24 Z" fill="#000000" stroke="#000000" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/></g><switch><g requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"/><a transform="translate(0,-5)" xlink:href="https://desk.draw.io/support/solutions/articles/16000042487" target="_blank"><text text-anchor="middle" font-size="10px" x="50%" y="100%">Viewer does not support full SVG 1.1</text></a></switch></svg>
diff --git a/assets/vae.py b/assets/vae.py
@@ -0,0 +1,165 @@
+# VAE implementation exercise for Depth First Learning Curriculum: Normalizing Flows for Variational Inference.
+# This is a stripped-down version of the PyTorch VAE example available at https://github.com/pytorch/examples/blob/master/vae/main.py.
+
+from __future__ import print_function
+import argparse
+import torch
+import torch.utils.data
+from torch import nn, optim
+from torch.nn import functional as F
+from torchvision import datasets, transforms
+from torchvision.utils import save_image
+
+
+# Command-line options; parsed at module level, so `args` is visible to everything below.
+parser = argparse.ArgumentParser(description='VAE MNIST Example')
+parser.add_argument('--batch-size', type=int, default=128, metavar='N',
+                    help='input batch size for training (default: 128)')
+parser.add_argument('--epochs', type=int, default=10, metavar='N',
+                    help='number of epochs to train (default: 10)')
+parser.add_argument('--no-cuda', action='store_true', default=False,
+                    help='disables CUDA training')
+parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)')
+parser.add_argument('--log-interval', type=int, default=10, metavar='N',
+                    help='how many batches to wait before logging training status')
+args = parser.parse_args()
+args.cuda = not args.no_cuda and torch.cuda.is_available()  # CUDA unless --no-cuda or unavailable
+
+torch.manual_seed(args.seed)  # seed PyTorch's RNG from --seed
+
+device = torch.device("cuda" if args.cuda else "cpu")  # GPU only when args.cuda is set
+# Extra DataLoader options (one worker, pinned memory) are only passed when running on CUDA.
+kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
+train_loader = torch.utils.data.DataLoader(
+    datasets.MNIST('../data', train=True, download=True,
+                   transform=transforms.ToTensor()),
+    batch_size=args.batch_size, shuffle=True, **kwargs)
+test_loader = torch.utils.data.DataLoader(
+    datasets.MNIST('../data', train=False, transform=transforms.ToTensor()),
+    batch_size=args.batch_size, shuffle=True, **kwargs)  # same batch size as training
+
+
+class VAE(nn.Module):
+    def __init__(self):
+        super(VAE, self).__init__()
+
+        # fc1, fc21 and fc22 are used by the encoder.
+        # fc1 takes a vectorized (784-element) MNIST image as input.
+        # fc21 and fc22 are both attached to the activation output of fc1 (using ReLU).
+        # fc21 outputs the means, and fc22 the log-variances, of
+        # each component of the 20-dimensional latent Gaussian.
+        self.fc1 = nn.Linear(784, 400)
+        self.fc21 = nn.Linear(400, 20)
+        self.fc22 = nn.Linear(400, 20)
+        # fc3 and fc4 are connected in series as the decoder.
+        # fc3 takes a realization from the latent space as input,
+        # and the decoder generates a vectorized 28x28 image.
+        # The output of fc3 passes through a ReLU,
+        # while fc4 uses a sigmoid in order to output a probability for each pixel.
+        self.fc3 = nn.Linear(20, 400)
+        self.fc4 = nn.Linear(400, 784)
+
+    # TODO: Implement the following four functions.  Note that they should be able to accept arguments containing stacked information for multiple observations,
+    # e.g. a minibatch rather than a single observation.  Your solution will need to handle this.  If you treat the arguments as
+    # representing a single observation in your logic, in most cases broadcasting will do the rest of the job automatically for you.
+    def encode(self, x):
+        # This should return the outputs of fc21 (means) and fc22 (log-variances) as a tuple.
+        pass
+
+    def reparameterize(self, mu, logvar):
+        # This should sample vectors from a standard isotropic Gaussian (zero mean, unit variance), and use these to
+        # generate and return samples whose means are given by mu and whose log-variances are given by logvar.
+        pass
+
+    def decode(self, z):
+        # Pass z through the decoder. For each 20-dimensional latent realization, there should be a 784-dimensional vector of
+        # probabilities generated, one per pixel.
+        pass
+
+    def forward(self, x):
+        # For each observation in x:
+        # 1. Pass it through the encoder to get predicted variational distribution parameters
+        # 2. Reparameterize an isotropic Gaussian with these parameters to get sample latent variable realizations
+        # 3. Pass the realization through the decoder to get predicted pixel probabilities
+        # Return a tuple with 3 elements: (a) the predicted pixel probabilities, (b) the predicted variational means, and (c) the predicted variational log-variances
+        x = x.view(-1,784) # Flatten each image into a 784-element row, the input size fc1 expects
+        pass
+
+model = VAE().to(device)  # single global model, placed on the selected device
+optimizer = optim.Adam(model.parameters(), lr=1e-3)  # Adam over all of the model's parameters
+
+# TODO: Implement this loss function
+def loss_function(recon_x, x, mu, logvar):
+    # The loss should be (an estimate of) the negative ELBO - remember we wish to maximise the ELBO - but the ELBO can be written in a number of forms.
+    # In this case, the prior for the latent variable and the variational posterior are both Gaussians, and we will exploit this.
+    # Specifically, we can analytically calculate a part of the ELBO, and only use Monte Carlo estimation for the rest.
+    # 1. We use the form of the ELBO which includes a KL divergence between the latent prior and the variational family
+    # - see the form at the bottom of page 6 of Blei et al.'s "Variational Inference: A Review for Statisticians".
+    # 2. In this case, the expression for the relevant KL divergence can be obtained from Exercise (e) in Week 1.
+    #
+    # The other term is the expected conditional log-likelihood, which is estimated using a single Monte Carlo sample.
+    # For the log-likelihood, one evaluates the probability of observing an input point given the "conditional distribution" for
+    # observations output by the network - in this case, each pixel is independently Bernoulli with parameter equal to the output probability.
+    # You may find torch.nn.functional's binary_cross_entropy function useful here.
+    #
+    # Additional: the extraction of the KL divergence as above reduces the variance.  Investigate the effect of directly estimating
+    # the full ELBO term for each observation with a single Monte Carlo sample.
+    #
+    # Note: train() and test() divide the returned value by the number of observations, so sum (do not average) over the batch.
+    #
+    # Return a single value accumulating the loss over the whole batch.
+    #
+    # Arguments:
+    # x is the batch of observations
+    # recon_x, mu, and logvar are the outputs of forward(x) (above) - see the usage below
+    x = x.view(-1,784) # Flatten each image to 784 pixels so it lines up with the per-pixel probabilities expected in recon_x
+    pass
+
+def train(epoch):
+    """Run one training epoch over train_loader, logging every args.log_interval batches."""
+    model.train()
+    num_batches, num_examples = len(train_loader), len(train_loader.dataset)
+    epoch_loss = 0
+    for step, (images, _) in enumerate(train_loader):
+        images = images.to(device)
+        optimizer.zero_grad()
+        recon, mu, logvar = model(images)
+        batch_loss = loss_function(recon, images, mu, logvar)
+        batch_loss.backward()
+        epoch_loss += batch_loss.item()
+        optimizer.step()
+        if step % args.log_interval == 0:
+            # Progress counts examples seen so far; the loss shown is per example for this batch.
+            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                epoch, step * len(images), num_examples, 100. * step / num_batches,
+                batch_loss.item() / len(images)))
+    print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, epoch_loss / num_examples))
+
+
+def test(epoch):
+    """Print the per-example test loss and save inputs alongside their reconstructions."""
+    model.eval()
+    test_loss = 0
+    with torch.no_grad():
+        for i, (data, _) in enumerate(test_loader):
+            data = data.to(device)
+            recon_batch, mu, logvar = model(data)
+            test_loss += loss_function(recon_batch, data, mu, logvar).item()
+            if i == 0:
+                n = min(data.size(0), 8)
+                # Infer the batch dimension: the batch may hold fewer than args.batch_size images.
+                recon_images = recon_batch.view(-1, 1, 28, 28)[:n]
+                save_image(torch.cat([data[:n], recon_images]).cpu(),
+                           'results/reconstruction_' + str(epoch) + '.png', nrow=n)
+    test_loss /= len(test_loader.dataset)
+    print('====> Test set loss: {:.4f}'.format(test_loss))
+
+if __name__ == "__main__":
+    # Each epoch: train, evaluate, then decode 64 random 20-dimensional latents into images.
+    for epoch in range(1, args.epochs + 1):
+        train(epoch)
+        test(epoch)
+        with torch.no_grad():
+            latents = torch.randn(64, 20).to(device)
+            images = model.decode(latents).cpu().view(64, 1, 28, 28)
+            save_image(images, 'results/sample_' + str(epoch) + '.png')