From b648259e62b4f2da0df86718701d0e908fdfb726 Mon Sep 17 00:00:00 2001 From: JHopeCollins Date: Thu, 4 May 2023 09:01:34 +0100 Subject: [PATCH 1/9] deal with testing for linear paradiag snes properly --- asQ/paradiag.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asQ/paradiag.py b/asQ/paradiag.py index 19025aeb..1a7f2fd5 100644 --- a/asQ/paradiag.py +++ b/asQ/paradiag.py @@ -204,7 +204,8 @@ def solve(self, postproc(self, wndw) converged_reason = self.snes.getConvergedReason() - is_linear = self.flat_solver_parameters['snes_type'] == 'ksponly' + is_linear = ('snes_type' in self.flat_solver_parameters + and self.flat_solver_parameters['snes_type'] == 'ksponly') if is_linear and (converged_reason == 5): pass elif not (1 < converged_reason < 5): From 5dc06267da979b88660810c7240e53a0c96fbb5d Mon Sep 17 00:00:00 2001 From: JHopeCollins Date: Thu, 4 May 2023 09:10:09 +0100 Subject: [PATCH 2/9] factor constructing riesz map solve out of pc constructor --- asQ/diag_preconditioner.py | 113 +++++++++++------- .../shallow_water/linear_gravity_bumps.py | 4 +- 2 files changed, 69 insertions(+), 48 deletions(-) diff --git a/asQ/diag_preconditioner.py b/asQ/diag_preconditioner.py index 189f11ea..fb0c6239 100644 --- a/asQ/diag_preconditioner.py +++ b/asQ/diag_preconditioner.py @@ -10,6 +10,64 @@ import asQ.complex_proxy.vector as cpx +def construct_riesz_map(V, W, prefix, riesz_options=None): + """ + Construct projection into W assuming W is a complex-proxy + FunctionSpace for the real FunctionSpace V. + + :arg V: a real-valued FunctionSpace. + :arg W: a complex-proxy FunctionSpace for V. + :arg prefix: the prefix for the PETSc options for the projection solve. + :arg riesz_options: PETSc options for the projection solve. Defaults to direct solve. 
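+
+    A rough usage sketch (the variable names and the prefix shown here are
+    illustrative, not part of the API):
+
+        rmap, rhs = construct_riesz_map(V, W, prefix="diagfft_mass_")
+        # ...fill rhs with the residual to be projected into W...
+        rmap.solve(w, rhs)  # w is a Function in W receiving the projection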
+ """ + # default is to solve directly + if riesz_options is None: + riesz_options = { + 'ksp_type': 'preonly', + 'pc_type': 'lu', + 'pc_factor_mat_solver_type': 'mumps', + 'mat_type': 'aij' + } + + # mixed mass matrices are decoupled so solve seperately + if isinstance(V.ufl_element(), fd.MixedElement): + full_riesz_options = { + 'ksp_type': 'preonly', + 'mat_type': 'nest', + 'pc_type': 'fieldsplit', + 'pc_field_split_type': 'additive', + 'fieldsplit': riesz_options + } + else: + full_riesz_options = riesz_options + + # mat types + mat_type = PETSc.Options().getString( + f"{prefix}mat_type", + default=riesz_options['mat_type']) + + sub_mat_type = PETSc.Options().getString( + f"{prefix}fieldsplit_mat_type", + default=riesz_options['mat_type']) + + # input for riesz map + rhs = fd.Function(W) + + # construct forms + v = fd.TestFunction(W) + u = fd.TrialFunction(W) + + a = fd.assemble(fd.inner(u, v)*fd.dx, + mat_type=mat_type, + sub_mat_type=sub_mat_type) + + # create LinearSolver + rmap = fd.LinearSolver(a, solver_parameters=full_riesz_options, + options_prefix=f"{prefix}") + + return rmap, rhs + + class DiagFFTPC(object): prefix = "diagfft_" @@ -142,48 +200,11 @@ def initialize(self, pc): self.a1 = np.zeros(self.p1.subshape, complex) self.transfer = self.p0.transfer(self.p1, complex) - # setting up the Riesz map - default_riesz_method = { - 'ksp_type': 'preonly', - 'pc_type': 'lu', - 'pc_factor_mat_solver_type': 'mumps', - 'mat_type': 'aij' - } - - # mixed mass matrices are decoupled so solve seperately - if isinstance(self.blockV.ufl_element(), fd.MixedElement): - default_riesz_parameters = { - 'ksp_type': 'preonly', - 'mat_type': 'nest', - 'pc_type': 'fieldsplit', - 'pc_field_split_type': 'additive', - 'fieldsplit': default_riesz_method - } - else: - default_riesz_parameters = default_riesz_method - - # we need to pass the mat_types to assemble directly because - # it won't pick them up from Options - - riesz_mat_type = PETSc.Options().getString( - f"{prefix}{self.prefix}mass_mat_type", - default=default_riesz_parameters['mat_type']) - - riesz_sub_mat_type = PETSc.Options().getString( - f"{prefix}{self.prefix}mass_fieldsplit_mat_type", - default=default_riesz_method['mat_type']) - - # input for the Riesz map - self.xtemp = fd.Function(self.CblockV) - v = fd.TestFunction(self.CblockV) - u = fd.TrialFunction(self.CblockV) - - a = fd.assemble(fd.inner(u, v)*fd.dx, - mat_type=riesz_mat_type, - sub_mat_type=riesz_sub_mat_type) - - self.Proj = fd.LinearSolver(a, solver_parameters=default_riesz_parameters, - options_prefix=f"{prefix}{self.prefix}mass_") + # setting up the Riesz map to project residual into complex space + rmap_rhs = construct_riesz_map(self.blockV, + self.CblockV, + prefix=f"{prefix}{self.prefix}mass_") + self.riesz_proj, self.riesz_rhs = rmap_rhs # building the Jacobian of the nonlinear term # what we want is a block diagonal matrix in the 2x2 system @@ -336,14 +357,14 @@ def apply(self, pc, x, y): for i in range(self.aaos.nlocal_timesteps): # copy the data into solver input - self.xtemp.assign(0.) + self.riesz_rhs.assign(0.) 
- cpx.set_real(self.xtemp, self.aaos.get_field_components(i, f_alls=self.xfr.subfunctions)) - cpx.set_imag(self.xtemp, self.aaos.get_field_components(i, f_alls=self.xfi.subfunctions)) + cpx.set_real(self.riesz_rhs, self.aaos.get_field_components(i, f_alls=self.xfr.subfunctions)) + cpx.set_imag(self.riesz_rhs, self.aaos.get_field_components(i, f_alls=self.xfi.subfunctions)) # Do a project for Riesz map, to be superceded # when we get Cofunction - self.Proj.solve(self.Jprob_in, self.xtemp) + self.riesz_proj.solve(self.Jprob_in, self.riesz_rhs) # solve the block system self.Jprob_out.assign(0.) diff --git a/examples/shallow_water/linear_gravity_bumps.py b/examples/shallow_water/linear_gravity_bumps.py index 7a1e4aa8..42252c83 100644 --- a/examples/shallow_water/linear_gravity_bumps.py +++ b/examples/shallow_water/linear_gravity_bumps.py @@ -115,8 +115,8 @@ } sparameters_diag['diagfft_block'] = sparameters -#for i in range(window_length): -# sparameters_diag['diagfft_block_'+str(i)+'_'] = sparameters +# for i in range(window_length): +# sparameters_diag['diagfft_block_'+str(i)+'_'] = sparameters create_mesh = partial( swe.create_mg_globe_mesh, From c8cc9a68417b553ea57652917b12280bb8782a52 Mon Sep 17 00:00:00 2001 From: JHopeCollins Date: Thu, 4 May 2023 13:44:46 +0100 Subject: [PATCH 3/9] various renames and reducing copies in diagfftpc --- asQ/diag_preconditioner.py | 89 ++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 41 deletions(-) diff --git a/asQ/diag_preconditioner.py b/asQ/diag_preconditioner.py index fb0c6239..b556fade 100644 --- a/asQ/diag_preconditioner.py +++ b/asQ/diag_preconditioner.py @@ -138,8 +138,8 @@ def initialize(self, pc): assert (self.blockV.dim()*paradiag.nlocal_timesteps == W_all.dim()) # Input/Output wrapper Functions for all-at-once residual being acted on - self.xf = fd.Function(W_all) # input - self.yf = fd.Function(W_all) # output + self.xfunc = fd.Function(W_all) # input + self.yfunc = fd.Function(W_all) # output # Gamma coefficients exponents = np.arange(self.ntimesteps)/self.ntimesteps @@ -158,8 +158,8 @@ def initialize(self, pc): C1col[:2] = np.array([1, -1])/dt C2col[:2] = np.array([theta, 1-theta]) - self.D1 = np.sqrt(self.ntimesteps)*fft(self.Gam*C1col) - self.D2 = np.sqrt(self.ntimesteps)*fft(self.Gam*C2col) + self.D1 = fft(self.Gam*C1col, norm='backward') + self.D2 = fft(self.Gam*C2col, norm='backward') # Block system setup # First need to build the vector function space version of blockV @@ -177,12 +177,12 @@ def initialize(self, pc): self.uwrk = fd.Function(self.blockV) # input and output functions to the block solve - self.Jprob_in = fd.Function(self.CblockV) - self.Jprob_out = fd.Function(self.CblockV) + self.block_rhs = fd.Function(self.CblockV) + self.block_sol = fd.Function(self.CblockV) # A place to store the real/imag components of the all-at-once residual after fft - self.xfi = fd.Function(W_all) - self.xfr = fd.Function(W_all) + self.xreal = fd.Function(W_all) + self.ximag = fd.Function(W_all) # setting up the FFT stuff # construct simply dist array and 1d fftn: @@ -199,6 +199,7 @@ def initialize(self, pc): # a0 is the local part of our other fft working array self.a1 = np.zeros(self.p1.subshape, complex) self.transfer = self.p0.transfer(self.p1, complex) + self.cwrk = np.zeros((self.nlocal_timesteps, nlocal), dtype=complex) # setting up the Riesz map to project residual into complex space rmap_rhs = construct_riesz_map(self.blockV, @@ -235,7 +236,7 @@ def initialize(self, pc): # The rhs v = fd.TestFunction(self.CblockV) 
- L = fd.inner(v, self.Jprob_in)*fd.dx + L = fd.inner(v, self.block_rhs)*fd.dx # pass sigma into PC: sigma = self.D1[ii]**2/self.D2[ii] @@ -260,7 +261,7 @@ def initialize(self, pc): block_prefix = f"{block_prefix}{str(ii)}_" break - jprob = fd.LinearVariationalProblem(A, L, self.Jprob_out, + jprob = fd.LinearVariationalProblem(A, L, self.block_sol, bcs=self.CblockV_bcs) Jsolver = fd.LinearVariationalSolver(jprob, appctx=appctx_h, @@ -322,13 +323,15 @@ def apply(self, pc, x, y): # copy petsc vec into Function # hopefully this works - with self.xf.dat.vec_wo as v: + with self.xfunc.dat.vec_wo as v: x.copy(v) # get array of basis coefficients - with self.xf.dat.vec_ro as v: - parray = v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) + with self.xfunc.dat.vec_ro as v: + self.cwrk.real[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) + self.cwrk.imag[:] = 0 + # This produces an array whose rows are time slices # and columns are finite element basis coefficients @@ -336,21 +339,24 @@ def apply(self, pc, x, y): # Diagonalise - scale, transfer, FFT, transfer, Copy # Scale # is there a better way to do this with broadcasting? - parray = (1.0+0.j)*(self.Gam_slice*parray.T).T*np.sqrt(self.ntimesteps) + + self.cwrk[:] = (1.0+0.j)*(self.Gam_slice*self.cwrk.T).T + # transfer forward - self.a0[:] = parray[:] + self.a0[:] = self.cwrk[:] self.transfer.forward(self.a0, self.a1) # FFT self.a1[:] = fft(self.a1, axis=0) # transfer backward self.transfer.backward(self.a1, self.a0) - # Copy into xfi, xfr - parray[:] = self.a0[:] - with self.xfr.dat.vec_wo as v: - v.array[:] = parray.real.reshape(-1) - with self.xfi.dat.vec_wo as v: - v.array[:] = parray.imag.reshape(-1) + # Copy into ximag, xreal + self.cwrk[:] = self.a0[:] + + with self.xreal.dat.vec_wo as v: + v.array[:] = self.cwrk.real.reshape(-1) + with self.ximag.dat.vec_wo as v: + v.array[:] = self.cwrk.imag.reshape(-1) ##################### # Do the block solves @@ -359,44 +365,45 @@ def apply(self, pc, x, y): # copy the data into solver input self.riesz_rhs.assign(0.) - cpx.set_real(self.riesz_rhs, self.aaos.get_field_components(i, f_alls=self.xfr.subfunctions)) - cpx.set_imag(self.riesz_rhs, self.aaos.get_field_components(i, f_alls=self.xfi.subfunctions)) + cpx.set_real(self.riesz_rhs, self.aaos.get_field_components(i, f_alls=self.xreal.subfunctions)) + cpx.set_imag(self.riesz_rhs, self.aaos.get_field_components(i, f_alls=self.ximag.subfunctions)) # Do a project for Riesz map, to be superceded # when we get Cofunction - self.riesz_proj.solve(self.Jprob_in, self.riesz_rhs) + self.riesz_proj.solve(self.block_rhs, self.riesz_rhs) # solve the block system - self.Jprob_out.assign(0.) + self.block_sol.assign(0.) 
self.Jsolvers[i].solve() # copy the data from solver output - cpx.get_real(self.Jprob_out, self.aaos.get_field_components(i, f_alls=self.xfr.subfunctions)) - cpx.get_imag(self.Jprob_out, self.aaos.get_field_components(i, f_alls=self.xfi.subfunctions)) + cpx.get_real(self.block_sol, self.aaos.get_field_components(i, f_alls=self.xreal.subfunctions)) + cpx.get_imag(self.block_sol, self.aaos.get_field_components(i, f_alls=self.ximag.subfunctions)) ###################### # Undiagonalise - Copy, transfer, IFFT, transfer, scale, copy # get array of basis coefficients - with self.xfi.dat.vec_ro as v: - parray = 1j*v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) - with self.xfr.dat.vec_ro as v: - parray += v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) + with self.ximag.dat.vec_ro as v: + self.cwrk[:] = 1j*v.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) + with self.xreal.dat.vec_ro as v: + self.cwrk[:] += v.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) # transfer forward - self.a0[:] = parray[:] + self.a0[:] = self.cwrk[:] self.transfer.forward(self.a0, self.a1) # IFFT self.a1[:] = ifft(self.a1, axis=0) # transfer backward self.transfer.backward(self.a1, self.a0) - parray[:] = self.a0[:] + self.cwrk[:] = self.a0[:] # scale - parray = ((1.0/self.Gam_slice)*parray.T).T - # Copy into xfi, xfr - with self.yf.dat.vec_wo as v: - v.array[:] = parray.reshape(-1).real - with self.yf.dat.vec_ro as v: + self.cwrk[:] = ((1.0/self.Gam_slice)*self.cwrk.T).T + + # Copy into ximag, xreal + with self.yfunc.dat.vec_wo as v: + v.array[:] = self.cwrk.reshape(-1).real + with self.yfunc.dat.vec_ro as v: v.copy(y) ################ From 4335aea96a754f5ef15bf7ceb2d4fceb2d10610a Mon Sep 17 00:00:00 2001 From: JHopeCollins Date: Thu, 4 May 2023 19:03:59 +0100 Subject: [PATCH 4/9] split up DiagFFTPC.apply into the paradiag 3-step --- asQ/diag_preconditioner.py | 169 ++++++++++++++++++++++--------------- 1 file changed, 99 insertions(+), 70 deletions(-) diff --git a/asQ/diag_preconditioner.py b/asQ/diag_preconditioner.py index b556fade..3b874c51 100644 --- a/asQ/diag_preconditioner.py +++ b/asQ/diag_preconditioner.py @@ -137,10 +137,6 @@ def initialize(self, pc): # sanity check assert (self.blockV.dim()*paradiag.nlocal_timesteps == W_all.dim()) - # Input/Output wrapper Functions for all-at-once residual being acted on - self.xfunc = fd.Function(W_all) # input - self.yfunc = fd.Function(W_all) # output - # Gamma coefficients exponents = np.arange(self.ntimesteps)/self.ntimesteps self.Gam = paradiag.alpha**exponents @@ -180,7 +176,7 @@ def initialize(self, pc): self.block_rhs = fd.Function(self.CblockV) self.block_sol = fd.Function(self.CblockV) - # A place to store the real/imag components of the all-at-once residual after fft + # A place to store the real/imag components of the all-at-once residual self.xreal = fd.Function(W_all) self.ximag = fd.Function(W_all) @@ -199,7 +195,6 @@ def initialize(self, pc): # a0 is the local part of our other fft working array self.a1 = np.zeros(self.p1.subshape, complex) self.transfer = self.p0.transfer(self.p1, complex) - self.cwrk = np.zeros((self.nlocal_timesteps, nlocal), dtype=complex) # setting up the Riesz map to project residual into complex space rmap_rhs = construct_riesz_map(self.blockV, @@ -300,7 +295,7 @@ def update(self, pc): an operator that is block diagonal in the 2x2 system coupling real and imaginary parts. 
''' - self.ureduce.assign(0) + self.ureduce.assign(0.) urs = self.ureduce.subfunctions for i in range(self.nlocal_timesteps): @@ -319,57 +314,95 @@ def update(self, pc): @PETSc.Log.EventDecorator() @memprofile - def apply(self, pc, x, y): + def to_eigenbasis(self, xreal, ximag, output='real,imag'): + """ + In-place transform of the complex vector (xreal, ximag) to the preconditioner (block-)eigenbasis. + :arg xreal: real part of input and output + :arg ximag: real part of input and output + """ + # copy data into working array + with xreal.dat.vec_ro as v: + self.a0.real[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) + with ximag.dat.vec_ro as v: + self.a0.imag[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) - # copy petsc vec into Function - # hopefully this works - with self.xfunc.dat.vec_wo as v: - x.copy(v) + # alpha-weighting + self.a0.real[:] = (self.Gam_slice*self.a0.real.T).T - # get array of basis coefficients - with self.xfunc.dat.vec_ro as v: - self.cwrk.real[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) - self.cwrk.imag[:] = 0 + # transpose forward + self.transfer.forward(self.a0, self.a1) - # This produces an array whose rows are time slices - # and columns are finite element basis coefficients + # FFT + self.a1[:] = fft(self.a1, axis=0) - ###################### - # Diagonalise - scale, transfer, FFT, transfer, Copy - # Scale - # is there a better way to do this with broadcasting? + # transpose backward + self.transfer.backward(self.a1, self.a0) - self.cwrk[:] = (1.0+0.j)*(self.Gam_slice*self.cwrk.T).T + # copy back into output + if 'real' in output: + with xreal.dat.vec_wo as v: + v.array[:] = self.a0.real.reshape(-1) + if 'imag' in output: + with ximag.dat.vec_wo as v: + v.array[:] = self.a0.imag.reshape(-1) - # transfer forward - self.a0[:] = self.cwrk[:] + @PETSc.Log.EventDecorator() + @memprofile + def from_eigenbasis(self, xreal, ximag, output='real,imag'): + """ + In-place transform of the complex vector (xreal, ximag) from the preconditioner (block-)eigenbasis. + :arg xreal: real part of input and output + :arg ximag: real part of input and output + """ + # copy data into working array + with xreal.dat.vec_ro as v: + self.a0.real[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) + with ximag.dat.vec_ro as v: + self.a0.imag[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, + self.blockV.node_set.size)) + + # transpose forward self.transfer.forward(self.a0, self.a1) - # FFT - self.a1[:] = fft(self.a1, axis=0) - # transfer backward + # IFFT + self.a1[:] = ifft(self.a1, axis=0) + + # transpose backward self.transfer.backward(self.a1, self.a0) - # Copy into ximag, xreal - self.cwrk[:] = self.a0[:] - with self.xreal.dat.vec_wo as v: - v.array[:] = self.cwrk.real.reshape(-1) - with self.ximag.dat.vec_wo as v: - v.array[:] = self.cwrk.imag.reshape(-1) - ##################### + # alpha-weighting + self.a0[:] = ((1.0/self.Gam_slice)*self.a0.T).T - # Do the block solves + # copy back into output + if 'real' in output: + with xreal.dat.vec_wo as v: + v.array[:] = self.a0.real.reshape(-1) + if 'imag' in output: + with ximag.dat.vec_wo as v: + v.array[:] = self.a0.imag.reshape(-1) + + @PETSc.Log.EventDecorator() + @memprofile + def solve_blocks(self, xreal, ximag): + """ + Solve each of the blocks in the diagonalised preconditioner with + complex vector (xreal,ximag) as the right-hand-sides. 
+ :arg xreal: real part of input and output + :arg ximag: real part of input and output + """ + def get_field(i, x): + return self.aaos.get_field_components(i, f_alls=x.subfunctions) for i in range(self.aaos.nlocal_timesteps): # copy the data into solver input - self.riesz_rhs.assign(0.) - - cpx.set_real(self.riesz_rhs, self.aaos.get_field_components(i, f_alls=self.xreal.subfunctions)) - cpx.set_imag(self.riesz_rhs, self.aaos.get_field_components(i, f_alls=self.ximag.subfunctions)) + cpx.set_real(self.riesz_rhs, get_field(i, xreal)) + cpx.set_imag(self.riesz_rhs, get_field(i, ximag)) - # Do a project for Riesz map, to be superceded - # when we get Cofunction + # Do a project for Riesz map, to be superceded when we get Cofunction + self.block_rhs.assign(0.) self.riesz_proj.solve(self.block_rhs, self.riesz_rhs) # solve the block system @@ -377,35 +410,31 @@ def apply(self, pc, x, y): self.Jsolvers[i].solve() # copy the data from solver output - cpx.get_real(self.block_sol, self.aaos.get_field_components(i, f_alls=self.xreal.subfunctions)) - cpx.get_imag(self.block_sol, self.aaos.get_field_components(i, f_alls=self.ximag.subfunctions)) - - ###################### - # Undiagonalise - Copy, transfer, IFFT, transfer, scale, copy - # get array of basis coefficients - with self.ximag.dat.vec_ro as v: - self.cwrk[:] = 1j*v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) + cpx.get_real(self.block_sol, get_field(i, xreal)) + cpx.get_imag(self.block_sol, get_field(i, ximag)) + + @PETSc.Log.EventDecorator() + @memprofile + def apply(self, pc, x, y): + + # copy input Vec into Function + with self.xreal.dat.vec_wo as v: + x.copy(v) + with self.ximag.dat.vec_wo as v: + v.array[:] = 0 + + # forward FFT + self.to_eigenbasis(self.xreal, self.ximag) + + # Do the block solves + self.solve_blocks(self.xreal, self.ximag) + + # backward IFFT + self.from_eigenbasis(self.xreal, self.ximag, output='real') + + # copy solution into output Vec with self.xreal.dat.vec_ro as v: - self.cwrk[:] += v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) - # transfer forward - self.a0[:] = self.cwrk[:] - self.transfer.forward(self.a0, self.a1) - # IFFT - self.a1[:] = ifft(self.a1, axis=0) - # transfer backward - self.transfer.backward(self.a1, self.a0) - self.cwrk[:] = self.a0[:] - # scale - self.cwrk[:] = ((1.0/self.Gam_slice)*self.cwrk.T).T - - # Copy into ximag, xreal - with self.yfunc.dat.vec_wo as v: - v.array[:] = self.cwrk.reshape(-1).real - with self.yfunc.dat.vec_ro as v: v.copy(y) - ################ self._record_diagnostics() From b5e6e37cf2281b4148d158575e6220e2ebbaac88 Mon Sep 17 00:00:00 2001 From: JHopeCollins Date: Thu, 4 May 2023 19:19:14 +0100 Subject: [PATCH 5/9] avoid a copy from a PETSc Vec --- asQ/diag_preconditioner.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/asQ/diag_preconditioner.py b/asQ/diag_preconditioner.py index 3b874c51..f3a2cb3b 100644 --- a/asQ/diag_preconditioner.py +++ b/asQ/diag_preconditioner.py @@ -397,16 +397,17 @@ def get_field(i, x): return self.aaos.get_field_components(i, f_alls=x.subfunctions) for i in range(self.aaos.nlocal_timesteps): + self.block_rhs.assign(0.) + self.block_sol.assign(0.) + # copy the data into solver input cpx.set_real(self.riesz_rhs, get_field(i, xreal)) cpx.set_imag(self.riesz_rhs, get_field(i, ximag)) # Do a project for Riesz map, to be superceded when we get Cofunction - self.block_rhs.assign(0.) 
self.riesz_proj.solve(self.block_rhs, self.riesz_rhs) # solve the block system - self.block_sol.assign(0.) self.Jsolvers[i].solve() # copy the data from solver output @@ -420,8 +421,7 @@ def apply(self, pc, x, y): # copy input Vec into Function with self.xreal.dat.vec_wo as v: x.copy(v) - with self.ximag.dat.vec_wo as v: - v.array[:] = 0 + self.ximag.assign(0.) # forward FFT self.to_eigenbasis(self.xreal, self.ximag) From a567d0d2109a449902b790e82aa6464cab9e0df5 Mon Sep 17 00:00:00 2001 From: JHopeCollins Date: Tue, 9 May 2023 10:11:51 +0100 Subject: [PATCH 6/9] more explicit naming in pc --- asQ/diag_preconditioner.py | 84 +++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/asQ/diag_preconditioner.py b/asQ/diag_preconditioner.py index f3a2cb3b..ca16ca5d 100644 --- a/asQ/diag_preconditioner.py +++ b/asQ/diag_preconditioner.py @@ -131,19 +131,19 @@ def initialize(self, pc): self.w_all = self.aaos.w_all # basic model function space - self.blockV = self.aaos.function_space + self.function_space = self.aaos.function_space W_all = self.aaos.function_space_all # sanity check - assert (self.blockV.dim()*paradiag.nlocal_timesteps == W_all.dim()) + assert (self.function_space.dim()*paradiag.nlocal_timesteps == W_all.dim()) # Gamma coefficients exponents = np.arange(self.ntimesteps)/self.ntimesteps - self.Gam = paradiag.alpha**exponents + self.gamma = paradiag.alpha**exponents slice_begin = self.aaos.transform_index(0, from_range='slice', to_range='window') slice_end = slice_begin + self.nlocal_timesteps - self.Gam_slice = self.Gam[slice_begin:slice_end] + self.gamma_slice = self.gamma[slice_begin:slice_end] # circulant eigenvalues C1col = np.zeros(self.ntimesteps) @@ -154,27 +154,27 @@ def initialize(self, pc): C1col[:2] = np.array([1, -1])/dt C2col[:2] = np.array([theta, 1-theta]) - self.D1 = fft(self.Gam*C1col, norm='backward') - self.D2 = fft(self.Gam*C2col, norm='backward') + self.D1 = fft(self.gamma*C1col, norm='backward') + self.D2 = fft(self.gamma*C2col, norm='backward') # Block system setup - # First need to build the vector function space version of blockV - self.CblockV = cpx.FunctionSpace(self.blockV) + # First need to build the vector function space version of function_space + self.cpx_function_space = cpx.FunctionSpace(self.function_space) # set the boundary conditions to zero for the residual - self.CblockV_bcs = tuple((cb - for bc in self.aaos.boundary_conditions - for cb in cpx.DirichletBC(self.CblockV, self.blockV, - bc, 0*bc.function_arg))) + self.block_bcs = tuple((cb + for bc in self.aaos.boundary_conditions + for cb in cpx.DirichletBC(self.cpx_function_space, self.function_space, + bc, 0*bc.function_arg))) # function to do global reduction into for average block jacobian if self.jac_average == 'window': - self.ureduce = fd.Function(self.blockV) - self.uwrk = fd.Function(self.blockV) + self.ureduce = fd.Function(self.function_space) + self.uwrk = fd.Function(self.function_space) # input and output functions to the block solve - self.block_rhs = fd.Function(self.CblockV) - self.block_sol = fd.Function(self.CblockV) + self.block_rhs = fd.Function(self.cpx_function_space) + self.block_sol = fd.Function(self.cpx_function_space) # A place to store the real/imag components of the all-at-once residual self.xreal = fd.Function(W_all) @@ -184,7 +184,7 @@ def initialize(self, pc): # construct simply dist array and 1d fftn: subcomm = Subcomm(self.ensemble.ensemble_comm, [0, 1]) # dimensions of space-time data in this ensemble_comm - nlocal 
= self.blockV.node_set.size + nlocal = self.function_space.node_set.size NN = np.array([self.ntimesteps, nlocal], dtype=int) # transfer pencil is aligned along axis 1 self.p0 = Pencil(subcomm, NN, axis=1) @@ -197,8 +197,8 @@ def initialize(self, pc): self.transfer = self.p0.transfer(self.p1, complex) # setting up the Riesz map to project residual into complex space - rmap_rhs = construct_riesz_map(self.blockV, - self.CblockV, + rmap_rhs = construct_riesz_map(self.function_space, + self.cpx_function_space, prefix=f"{prefix}{self.prefix}mass_") self.riesz_proj, self.riesz_rhs = rmap_rhs @@ -211,12 +211,12 @@ def initialize(self, pc): # This is constructed by cpx.derivative # Building the nonlinear operator - self.Jsolvers = [] + self.block_solvers = [] form_mass = self.aaos.form_mass form_function = self.aaos.form_function # Now need to build the block solver - self.u0 = fd.Function(self.CblockV) # time average to linearise around + self.u0 = fd.Function(self.cpx_function_space) # time average to linearise around # building the block problem solvers for i in range(self.nlocal_timesteps): @@ -224,13 +224,13 @@ def initialize(self, pc): d1 = self.D1[ii] d2 = self.D2[ii] - M, D1r, D1i = cpx.BilinearForm(self.CblockV, d1, form_mass, return_z=True) + M, D1r, D1i = cpx.BilinearForm(self.cpx_function_space, d1, form_mass, return_z=True) K, D2r, D2i = cpx.derivative(d2, form_function, self.u0, return_z=True) A = M + K # The rhs - v = fd.TestFunction(self.CblockV) + v = fd.TestFunction(self.cpx_function_space) L = fd.inner(v, self.block_rhs)*fd.dx # pass sigma into PC: @@ -256,22 +256,22 @@ def initialize(self, pc): block_prefix = f"{block_prefix}{str(ii)}_" break - jprob = fd.LinearVariationalProblem(A, L, self.block_sol, - bcs=self.CblockV_bcs) - Jsolver = fd.LinearVariationalSolver(jprob, - appctx=appctx_h, - options_prefix=block_prefix) + block_prob = fd.LinearVariationalProblem(A, L, self.block_sol, + bcs=self.block_bcs) + block_solver = fd.LinearVariationalSolver(block_prob, + appctx=appctx_h, + options_prefix=block_prefix) # multigrid transfer manager if 'diag_transfer_managers' in paradiag.block_ctx: - # Jsolver.set_transfer_manager(paradiag.block_ctx['diag_transfer_managers'][ii]) + # block_solver.set_transfer_manager(paradiag.block_ctx['diag_transfer_managers'][ii]) tm = paradiag.block_ctx['diag_transfer_managers'][i] - Jsolver.set_transfer_manager(tm) - tm_set = (Jsolver._ctx.transfer_manager is tm) + block_solver.set_transfer_manager(tm) + tm_set = (block_solver._ctx.transfer_manager is tm) if tm_set is False: - print(f"transfer manager not set on Jsolvers[{ii}]") + print(f"transfer manager not set on block_solvers[{ii}]") - self.Jsolvers.append(Jsolver) + self.block_solvers.append(block_solver) self.initialized = True @@ -282,7 +282,7 @@ def _record_diagnostics(self): Must be called exactly once at the end of each apply() """ for i in range(self.aaos.nlocal_timesteps): - its = self.Jsolvers[i].snes.getLinearSolveIterations() + its = self.block_solvers[i].snes.getLinearSolveIterations() self.paradiag.block_iterations.dlocal[i] += its @PETSc.Log.EventDecorator() @@ -319,17 +319,18 @@ def to_eigenbasis(self, xreal, ximag, output='real,imag'): In-place transform of the complex vector (xreal, ximag) to the preconditioner (block-)eigenbasis. :arg xreal: real part of input and output :arg ximag: real part of input and output + :arg output: which parts of the result to copy the back into xreal and/or ximag. 
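+        e.g. to_eigenbasis(xreal, ximag) writes both parts back (the default),
+        while to_eigenbasis(xreal, ximag, output='real') would leave ximag
+        untouched.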
""" # copy data into working array with xreal.dat.vec_ro as v: self.a0.real[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) + self.function_space.node_set.size)) with ximag.dat.vec_ro as v: self.a0.imag[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) + self.function_space.node_set.size)) # alpha-weighting - self.a0.real[:] = (self.Gam_slice*self.a0.real.T).T + self.a0.real[:] = (self.gamma_slice*self.a0.real.T).T # transpose forward self.transfer.forward(self.a0, self.a1) @@ -355,14 +356,15 @@ def from_eigenbasis(self, xreal, ximag, output='real,imag'): In-place transform of the complex vector (xreal, ximag) from the preconditioner (block-)eigenbasis. :arg xreal: real part of input and output :arg ximag: real part of input and output + :arg output: which parts of the result to copy the back into xreal and/or ximag. """ # copy data into working array with xreal.dat.vec_ro as v: self.a0.real[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) + self.function_space.node_set.size)) with ximag.dat.vec_ro as v: self.a0.imag[:] = v.array_r.reshape((self.aaos.nlocal_timesteps, - self.blockV.node_set.size)) + self.function_space.node_set.size)) # transpose forward self.transfer.forward(self.a0, self.a1) @@ -374,7 +376,7 @@ def from_eigenbasis(self, xreal, ximag, output='real,imag'): self.transfer.backward(self.a1, self.a0) # alpha-weighting - self.a0[:] = ((1.0/self.Gam_slice)*self.a0.T).T + self.a0[:] = ((1.0/self.gamma_slice)*self.a0.T).T # copy back into output if 'real' in output: @@ -408,7 +410,7 @@ def get_field(i, x): self.riesz_proj.solve(self.block_rhs, self.riesz_rhs) # solve the block system - self.Jsolvers[i].solve() + self.block_solvers[i].solve() # copy the data from solver output cpx.get_real(self.block_sol, get_field(i, xreal)) From 68b1d8c2cb5686689171a7273bef9de4850d1fcf Mon Sep 17 00:00:00 2001 From: JHopeCollins Date: Tue, 9 May 2023 10:28:03 +0100 Subject: [PATCH 7/9] factor constructing pc blocks into its own function --- asQ/diag_preconditioner.py | 124 ++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 64 deletions(-) diff --git a/asQ/diag_preconditioner.py b/asQ/diag_preconditioner.py index ca16ca5d..457edad8 100644 --- a/asQ/diag_preconditioner.py +++ b/asQ/diag_preconditioner.py @@ -10,7 +10,7 @@ import asQ.complex_proxy.vector as cpx -def construct_riesz_map(V, W, prefix, riesz_options=None): +def construct_riesz_map(W, prefix, fieldsplit=False, riesz_options=None): """ Construct projection into W assuming W is a complex-proxy FunctionSpace for the real FunctionSpace V. 
@@ -30,7 +30,7 @@ def construct_riesz_map(V, W, prefix, riesz_options=None): } # mixed mass matrices are decoupled so solve seperately - if isinstance(V.ufl_element(), fd.MixedElement): + if fieldsplit: full_riesz_options = { 'ksp_type': 'preonly', 'mat_type': 'nest', @@ -197,11 +197,26 @@ def initialize(self, pc): self.transfer = self.p0.transfer(self.p1, complex) # setting up the Riesz map to project residual into complex space - rmap_rhs = construct_riesz_map(self.function_space, - self.cpx_function_space, - prefix=f"{prefix}{self.prefix}mass_") + is_mixed = isinstance(self.function_space.ufl_element(), fd.MixedElement) + rmap_rhs = construct_riesz_map(self.cpx_function_space, + prefix=f"{prefix}{self.prefix}mass_", + fieldsplit=is_mixed) self.riesz_proj, self.riesz_rhs = rmap_rhs + # Now need to build the block solvers + + # time-average function to linearise around + self.u0 = fd.Function(self.cpx_function_space) + + self.block_solvers = tuple((self._make_block(i, f"{prefix}{self.prefix}") + for i in range(self.nlocal_timesteps))) + + self.initialized = True + + def _make_block(self, i, prefix): + """ + Construct the LinearVariationalSolver for block index i. + """ # building the Jacobian of the nonlinear term # what we want is a block diagonal matrix in the 2x2 system # coupling the real and imaginary parts. @@ -210,70 +225,51 @@ def initialize(self, pc): # parts and then linearising. # This is constructed by cpx.derivative - # Building the nonlinear operator - self.block_solvers = [] form_mass = self.aaos.form_mass form_function = self.aaos.form_function - # Now need to build the block solver - self.u0 = fd.Function(self.cpx_function_space) # time average to linearise around + ii = self.aaos.transform_index(i, from_range='slice', to_range='window') + d1 = self.D1[ii] + d2 = self.D2[ii] - # building the block problem solvers - for i in range(self.nlocal_timesteps): - ii = self.aaos.transform_index(i, from_range='slice', to_range='window') - d1 = self.D1[ii] - d2 = self.D2[ii] - - M, D1r, D1i = cpx.BilinearForm(self.cpx_function_space, d1, form_mass, return_z=True) - K, D2r, D2i = cpx.derivative(d2, form_function, self.u0, return_z=True) - - A = M + K - - # The rhs - v = fd.TestFunction(self.cpx_function_space) - L = fd.inner(v, self.block_rhs)*fd.dx - - # pass sigma into PC: - sigma = self.D1[ii]**2/self.D2[ii] - sigma_inv = self.D2[ii]**2/self.D1[ii] - appctx_h = {} - appctx_h["sr"] = fd.Constant(np.real(sigma)) - appctx_h["si"] = fd.Constant(np.imag(sigma)) - appctx_h["sinvr"] = fd.Constant(np.real(sigma_inv)) - appctx_h["sinvi"] = fd.Constant(np.imag(sigma_inv)) - appctx_h["D2r"] = D2r - appctx_h["D2i"] = D2i - appctx_h["D1r"] = D1r - appctx_h["D1i"] = D1i - - # Options with prefix 'diagfft_block_' apply to all blocks by default - # If any options with prefix 'diagfft_block_{i}' exist, where i is the - # block number, then this prefix is used instead (like pc fieldsplit) - - block_prefix = f"{prefix}{self.prefix}block_" - for k, v in PETSc.Options().getAll().items(): - if k.startswith(f"{block_prefix}{str(ii)}_"): - block_prefix = f"{block_prefix}{str(ii)}_" - break - - block_prob = fd.LinearVariationalProblem(A, L, self.block_sol, - bcs=self.block_bcs) - block_solver = fd.LinearVariationalSolver(block_prob, - appctx=appctx_h, - options_prefix=block_prefix) - # multigrid transfer manager - if 'diag_transfer_managers' in paradiag.block_ctx: - # block_solver.set_transfer_manager(paradiag.block_ctx['diag_transfer_managers'][ii]) - tm = paradiag.block_ctx['diag_transfer_managers'][i] 
- block_solver.set_transfer_manager(tm) - tm_set = (block_solver._ctx.transfer_manager is tm) - - if tm_set is False: - print(f"transfer manager not set on block_solvers[{ii}]") - - self.block_solvers.append(block_solver) + M, D1r, D1i = cpx.BilinearForm(self.cpx_function_space, d1, form_mass, return_z=True) + K, D2r, D2i = cpx.derivative(d2, form_function, self.u0, return_z=True) - self.initialized = True + A = M + K + + # The rhs + v = fd.TestFunction(self.cpx_function_space) + L = fd.inner(v, self.block_rhs)*fd.dx + + # pass sigma into PC: + appctx_h = {} + + # Options with prefix 'diagfft_block_' apply to all blocks by default + # If any options with prefix 'diagfft_block_{i}' exist, where i is the + # block number, then this prefix is used instead (like pc fieldsplit) + + block_prefix = f"{prefix}block_" + for k, v in PETSc.Options().getAll().items(): + if k.startswith(f"{block_prefix}{str(ii)}_"): + block_prefix = f"{block_prefix}{str(ii)}_" + break + + block_prob = fd.LinearVariationalProblem(A, L, self.block_sol, + bcs=self.block_bcs) + block_solver = fd.LinearVariationalSolver(block_prob, + appctx=appctx_h, + options_prefix=block_prefix) + # multigrid transfer manager + if 'diag_transfer_managers' in self.paradiag.block_ctx: + # block_solver.set_transfer_manager(self.paradiag.block_ctx['diag_transfer_managers'][ii]) + tm = self.paradiag.block_ctx['diag_transfer_managers'][i] + block_solver.set_transfer_manager(tm) + tm_set = (block_solver._ctx.transfer_manager is tm) + + if tm_set is False: + print(f"transfer manager not set on block_solvers[{ii}]") + + return block_solver def _record_diagnostics(self): """ From cf0a727d74a4b14e5fcb0c5ae2471786b947a026 Mon Sep 17 00:00:00 2001 From: Josh Hope-Collins Date: Tue, 27 Jun 2023 09:51:16 +0100 Subject: [PATCH 8/9] merge with master --- .github/workflows/test.yml | 7 +- .gitignore | 1 + asQ/__init__.py | 2 +- asQ/allatoncesystem.py | 284 +++++++++++++++--- asQ/common.py | 9 + asQ/diag_preconditioner.py | 104 +++++-- asQ/paradiag.py | 84 ++++-- examples/advection/periodic_dg_advection.py | 15 +- examples/advection/unsteady_dg_advection.py | 256 ++++++++++++++++ examples/heat/heat.py | 197 ++++++++++++ examples/heat/mms.py | 200 ++++++++++++ .../shallow_water/galewsky_comparison.py | 2 +- .../serial/shallow_water/galewsky_serial.py | 14 +- .../serial/shallow_water/linear_swe_serial.py | 39 ++- examples/shallow_water/galewsky.py | 97 +++--- .../shallow_water/linear_gravity_bumps.py | 26 +- examples/shallow_water/williamson2.py | 185 +++++++----- examples/shallow_water/williamson5.py | 1 - examples/stratigraphic_model/mms.py | 86 ++++++ examples/stratigraphic_model/serial.py | 77 +++++ examples/stratigraphic_model/stratigraphic.py | 221 ++++++++++++++ tests/conftest.py | 66 ---- tests/test_paradiag.py | 275 +++++++++++++++-- utils/serial/miniapp.py | 13 +- utils/shallow_water/linear.py | 8 +- utils/shallow_water/miniapp.py | 22 +- utils/shallow_water/nonlinear.py | 8 +- 27 files changed, 1937 insertions(+), 362 deletions(-) create mode 100644 asQ/common.py create mode 100644 examples/advection/unsteady_dg_advection.py create mode 100644 examples/heat/heat.py create mode 100644 examples/heat/mms.py create mode 100644 examples/stratigraphic_model/mms.py create mode 100644 examples/stratigraphic_model/serial.py create mode 100644 examples/stratigraphic_model/stratigraphic.py delete mode 100644 tests/conftest.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 79127797..36b30e93 100644 --- a/.github/workflows/test.yml 
+++ b/.github/workflows/test.yml @@ -43,13 +43,10 @@ jobs: . /home/firedrake/firedrake/bin/activate python -m pip install pytest-timeout python -m pip install pytest-cov - python -m pip uninstall --yes pytest-mpi - name: Install run: | . /home/firedrake/firedrake/bin/activate firedrake-status - FFTW_INCLUDE_DIR=/home/firedrake/firedrake/src/petsc/default/include FFTW_LIBRARY_DIR=/home/firedrake/firedrake/src/petsc/default/lib python -m pip install mpi4py-fft - python -m pip install --upgrade scipy python -m pip install -e . - name: Lint run: | @@ -58,4 +55,6 @@ jobs: - name: Test run: | . /home/firedrake/firedrake/bin/activate - python -m pytest -vvv -s -n 4 --durations=0 --timeout=600 --cov=asQ tests/ + python --version + python -m pytest --version + python -m pytest -vvv -s -n 6 --durations=0 --timeout=600 --cov=asQ --cov-report=term tests/ diff --git a/.gitignore b/.gitignore index b50630a6..9ee5ac9e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ *.dat *.out .coverage* +**/metrics/* diff --git a/asQ/__init__.py b/asQ/__init__.py index 5e351ad5..ad5fa4ad 100644 --- a/asQ/__init__.py +++ b/asQ/__init__.py @@ -1,7 +1,7 @@ from .parallel_arrays import DistributedDataLayout1D, SharedArray, OwnedArray # noqa: F401 from .paradiag import paradiag, create_ensemble # noqa: F401 from .diag_preconditioner import DiagFFTPC # noqa: F401 -from .allatoncesystem import AllAtOnceSystem, JacobianMatrix # noqa: F401 +from .allatoncesystem import time_average, AllAtOnceSystem, JacobianMatrix # noqa: F401 from .post import (write_timesteps, # noqa: F401 write_timeseries, # noqa: F401 write_solver_parameters, # noqa: F401 diff --git a/asQ/allatoncesystem.py b/asQ/allatoncesystem.py index 5bd1a7ec..69c5f52a 100644 --- a/asQ/allatoncesystem.py +++ b/asQ/allatoncesystem.py @@ -2,44 +2,197 @@ from firedrake.petsc import PETSc from functools import reduce from operator import mul -from .profiling import memprofile +from asQ.profiling import memprofile +from asQ.common import get_option_from_list +from functools import partial from asQ.parallel_arrays import in_range, DistributedDataLayout1D +def time_average(aaos, uout, uwrk, uall=None, average='window'): + """ + Compute the time average of an all-at-once function + over either entire window or current slice. + + :arg aaos: AllAtOnceSystem describing the function to average. + :arg uout: Function to save average into. + :arg uwrk: Function to use as working buffer. + :arg uall: all-at-once function to average. If None the AllAtOnceSystem's is used. + :arg average: range of time-average. + 'window': compute over all timesteps in all-at-once function. + 'slice': compute only over timesteps on local ensemble member. + """ + if uall is None: + uall = aaos.w_all + ualls = uall.subfunctions + + # accumulate over local slice + uout.assign(0) + uouts = uout.subfunctions + for i in range(aaos.nlocal_timesteps): + for uo, uc in zip(uouts, aaos.get_field_components(i, f_alls=ualls)): + uo.assign(uo + uc) + + if average == 'slice': + nsamples = aaos.nlocal_timesteps + uout /= fd.Constant(nsamples) + elif average == 'window': + aaos.ensemble.allreduce(uout, uwrk) + nsamples = aaos.ntimesteps + uout.assign(uwrk/fd.Constant(nsamples)) + else: + raise ValueError(f"average type must be 'window' or 'slice', not {average}") + + return + + class JacobianMatrix(object): + """ + PETSc options: + + 'aaos_jacobian_linearisation': <'consistent', 'user'> + Which form to linearise when constructing the Jacobian. + Default is 'consistent'. 
+
+        'consistent': use the same form used in the AllAtOnceSystem residual.
+        'user': use the alternative forms given to the AllAtOnceSystem.
+
+    'aaos_jacobian_state': <'current', 'window', 'slice', 'linear', 'initial', 'reference', 'user'>
+        Which state to linearise around when constructing the Jacobian.
+        Default is 'current'.
+
+        'current': use the current state of the AllAtOnceSystem (i.e. current Newton iterate).
+        'window': use the time average over the entire AllAtOnceSystem.
+        'slice': use the time average over timesteps on the local Ensemble member.
+        'linear': the form being linearised is linear, so no update to the state is needed.
+        'initial': use the initial condition for all timesteps.
+        'reference': use the reference state of the AllAtOnceSystem for all timesteps.
+        'user': the state will be set manually by the user so no update is needed.
+    """
+    prefix = "aaos_jacobian_"
+
     @memprofile
-    def __init__(self, aaos):
+    def __init__(self, aaos, snes=None):
         r"""
         Python matrix for the Jacobian of the all at once system
         :param aaos: The AllAtOnceSystem object
         """
         self.aaos = aaos
-        self.u = fd.Function(self.aaos.function_space_all)  # for the input function
-        self.F = fd.Function(self.aaos.function_space_all)  # for the output residual
-        self.F_prev = fd.Function(self.aaos.function_space_all)  # Where we compute the
-        # part of the output residual from neighbouring contributions
-
-        self.urecv = fd.Function(self.aaos.function_space)  # will contain the previous time value i.e. 3*r-1
-        # Jform missing contributions from the previous step
-        # Find u1 s.t. F[u1, u2, u3; v] = 0 for all v
-        # definition:
-        # dF_{u1}[u1, u2, u3; delta_u, v] =
-        # lim_{eps -> 0} (F[u1+eps*delta_u,u2,u3;v]
-        # - F[u1,u2,u3;v])/eps
-        # Newton, solves for delta_u such that
-        # dF_{u1}[u1, u2, u3; delta_u, v] = -F[u1,u2,u3; v], for all v
-        # then updates u1 += delta_u
-        self.Jform = fd.derivative(self.aaos.aao_form, self.aaos.w_all)
+
+        if snes is not None:
+            self.snes = snes
+            prefix = snes.getOptionsPrefix()
+            prefix += self.prefix
+
+        # function to linearise around, and timestep from end of previous slice
+        self.u = fd.Function(self.aaos.function_space_all)
+        self.urecv = fd.Function(self.aaos.function_space)
+
+        # function the Jacobian acts on, and contribution from timestep at end of previous slice
+        self.x = fd.Function(self.aaos.function_space_all)
+        self.xrecv = fd.Function(self.aaos.function_space)
+
+        # output residual, and contribution from timestep at end of previous slice
+        self.F = fd.Function(self.aaos.function_space_all)
+        self.F_prev = fd.Function(self.aaos.function_space_all)
+
+        # working buffers for calculating time average when needed
+        self.ureduce = fd.Function(self.aaos.function_space)
+        self.uwrk = fd.Function(self.aaos.function_space)
+
+        # option for what form to linearise
+        valid_linearisations = ['consistent', 'user']
+
+        if snes is None:
+            self.jacobian_mass = aaos.jacobian_mass
+            self.jacobian_function = aaos.jacobian_function
+        else:
+            linear_option = f"{prefix}linearisation"
+
+            linear = get_option_from_list(linear_option, valid_linearisations, default_index=0)
+
+            if linear == 'consistent':
+                self.jacobian_mass = aaos.form_mass
+                self.jacobian_function = aaos.form_function
+            elif linear == 'user':
+                self.jacobian_mass = aaos.jacobian_mass
+                self.jacobian_function = aaos.jacobian_function
+
+        # all-at-once form to linearise
+        self.aao_form = self.aaos.construct_aao_form(wall=self.u, wrecv=self.urecv,
+                                                     mass=self.jacobian_mass,
+                                                     function=self.jacobian_function)
+
+        # Jform
without contributions from the previous step + self.Jform = fd.derivative(self.aao_form, self.u) # Jform contributions from the previous step - self.Jform_prev = fd.derivative(self.aaos.aao_form, - self.aaos.w_recv) + self.Jform_prev = fd.derivative(self.aao_form, self.urecv) + + self.Jaction = fd.action(self.Jform, self.x) + self.Jaction_prev = fd.action(self.Jform_prev, self.xrecv) + + # option for what state to linearise around + valid_jacobian_states = ['current', 'window', 'slice', 'linear', 'initial', 'reference', 'user'] + + if snes is None: + self.jacobian_state = lambda: 'current' + else: + state_option = f"{prefix}state" + + self.jacobian_state = partial(get_option_from_list, + state_option, valid_jacobian_states, default_index=0) + + jacobian_state = self.jacobian_state() + + if jacobian_state == 'reference' and self.aaos.reference_state is None: + raise ValueError("AllAtOnceSystem must be provided a reference state to use \'reference\' for aaos_jacobian_state.") + + self.update() + + def update(self, X=None): + # update the state to linearise around from the current all-at-once solution + + aaos = self.aaos + jacobian_state = self.jacobian_state() + + def uniform_state(u): + self.urecv.assign(u) + for i in range(aaos.nlocal_timesteps): + aaos.set_field(i, u, f_alls=self.u.subfunctions) + + if jacobian_state == 'linear': + pass + + elif jacobian_state == 'current': + if X is None: + self.u.assign(aaos.w_all) + self.urecv.assign(aaos.w_recv) + else: + aaos.update(X, wall=self.u, wrecv=self.urecv, blocking=True) + + elif jacobian_state in ('window', 'slice'): + time_average(aaos, self.ureduce, self.uwrk, average=jacobian_state) + uniform_state(self.ureduce) + + elif jacobian_state == 'initial': + uniform_state(aaos.initial_condition) + + elif jacobian_state == 'reference': + uniform_state(aaos.reference_state) + + elif jacobian_state == 'user': + pass + + return + + self.Jaction = fd.action(self.Jform, self.u) + self.Jaction_prev = fd.action(self.Jform_prev, self.urecv) @PETSc.Log.EventDecorator() @memprofile def mult(self, mat, X, Y): - self.aaos.update(X, wall=self.u, wrecv=self.urecv, blocking=True) + self.aaos.update(X, wall=self.x, wrecv=self.xrecv, blocking=True) # Set the flag for the circulant option if self.aaos.circ in ["quasi", "picard"]: @@ -48,9 +201,8 @@ def mult(self, mat, X, Y): self.aaos.Circ.assign(0.0) # assembly stage - fd.assemble(fd.action(self.Jform, self.u), tensor=self.F) - fd.assemble(fd.action(self.Jform_prev, self.urecv), - tensor=self.F_prev) + fd.assemble(self.Jaction, tensor=self.F) + fd.assemble(self.Jaction_prev, tensor=self.F_prev) self.F += self.F_prev # unset flag if alpha-circulant approximation only in Jacobian @@ -64,7 +216,7 @@ def mult(self, mat, X, Y): # at boundary nodes for bc in self.aaos.boundary_conditions_all: bc.homogenize() - bc.apply(self.F, u=self.u) + bc.apply(self.F, u=self.x) bc.restore() with self.F.dat.vec_ro as v: @@ -78,6 +230,9 @@ def __init__(self, dt, theta, form_mass, form_function, w0, bcs=[], + reference_state=None, + jacobian_function=None, + jacobian_mass=None, circ="", alpha=1e-3): """ The all-at-once system representing multiple timesteps of a time-dependent finite-element problem. @@ -86,6 +241,15 @@ def __init__(self, :arg time_partition: a list of integers for the number of timesteps stored on each ensemble rank. :arg w0: a Function containing the initial data. :arg bcs: a list of DirichletBC boundary conditions on w0.function_space. + :arg reference_state: a Function in W to use as a reference state + e.g. 
in DiagFFTPC + :arg circ: a string describing the option on where to use the + alpha-circulant modification. "picard" - do a nonlinear wave + form relaxation method. "quasi" - do a modified Newton + method with alpha-circulant modification added to the + Jacobian. To make the alpha circulant modification only in the + preconditioner, simply set ksp_type:preonly in the solve options. + :arg alpha: float, circulant matrix parameter """ self.layout = DistributedDataLayout1D(time_partition, ensemble.ensemble_comm) @@ -95,17 +259,35 @@ def __init__(self, self.nlocal_timesteps = self.layout.local_size self.ntimesteps = self.layout.global_size - self.initial_condition = w0 self.function_space = w0.function_space() + self.initial_condition = fd.Function(self.function_space).assign(w0) + + if reference_state is None: + self.reference_state = None + else: + if reference_state.function_space() != w0.function_space(): + raise ValueError("AllAtOnceSystem reference state must be in the" + + " same function space as the initial condition.") + self.reference_state = fd.Function(self.function_space).assign(reference_state) + + # need to make copy of bcs too instead of taking a reference self.boundary_conditions = bcs self.ncomponents = len(self.function_space.subfunctions) self.dt = dt + self.time = tuple(fd.Constant(0) for _ in range(self.nlocal_timesteps)) + for n in range((self.nlocal_timesteps)): + self.time[n].assign(self.time[n] + dt*(self.transform_index(n, from_range='slice', to_range='window') + 1)) + + self.t0 = fd.Constant(0.0) self.theta = theta self.form_mass = form_mass self.form_function = form_function + self.jacobian_mass = jacobian_mass if jacobian_mass is not None else form_mass + self.jacobian_function = jacobian_function if jacobian_function is not None else form_function + self.circ = circ self.alpha = alpha self.Circ = fd.Constant(0.0) @@ -140,8 +322,7 @@ def __init__(self, self.w_recv = fd.Function(self.function_space) self.w_send = fd.Function(self.function_space) - self._set_aao_form() - self.jacobian = JacobianMatrix(self) + self.aao_form = self.construct_aao_form(self.w_all, self.w_recv) def set_boundary_conditions(self, bcs): """ @@ -343,7 +524,8 @@ def next_window(self, w1=None): # persistence forecast for i in range(self.nlocal_timesteps): self.set_field(i, self.initial_condition, index_range='slice') - + self.time[i].assign(self.time[i] + self.dt*self.ntimesteps) + self.t0.assign(self.t0 + self.dt*self.ntimesteps) return @PETSc.Log.EventDecorator() @@ -429,13 +611,34 @@ def _assemble_function(self, snes, X, Fvec): with self.F_all.dat.vec_ro as v: v.copy(Fvec) - def _set_aao_form(self): + def construct_aao_form(self, wall=None, wrecv=None, mass=None, function=None): """ Constructs the bilinear form for the all at once system. Specific to the theta-centred Crank-Nicholson method + + :arg wall: all-at-once function to construct the form over. + Defaults to the AllAtOnceSystem's. + :arg wrecv: last timestep from previous time slice. + Defaults to the AllAtOnceSystem's. + :arg mass: a function that returns a linear form on w0.function_space() + providing the mass operator for the time derivative. + :arg function: a function that returns a form on w0.function_space() + providing f(w) for the ODE w_t + f(w) = 0. 
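+
+        A sketch of the expected signatures for a single-component problem
+        (assuming import firedrake as fd; the heat equation here is purely
+        illustrative):
+
+            def form_mass(u, v):
+                return u*v*fd.dx
+
+            def form_function(u, v, t):
+                return fd.inner(fd.grad(u), fd.grad(v))*fd.dx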
""" + if wall is None: + w_alls = fd.split(self.w_all) + else: + w_alls = fd.split(wall) + + if wrecv is None: + wrecv = self.w_recv + + if mass is None: + mass = self.form_mass + + if function is None: + function = self.form_function - w_alls = fd.split(self.w_all) test_fns = fd.TestFunctions(self.function_space_all) dt = fd.Constant(self.dt) @@ -449,7 +652,6 @@ def get_test(i): return self.get_field_components(i, f_alls=test_fns) for n in range(self.nlocal_timesteps): - # previous time level if n == 0: if self.time_rank == 0: @@ -457,13 +659,13 @@ def get_test(i): w0list = fd.split(self.initial_condition) # circulant option for quasi-Jacobian - wrecvlist = fd.split(self.w_recv) + wrecvlist = fd.split(wrecv) w0s = [w0list[i] + self.Circ*alpha*wrecvlist[i] for i in range(self.ncomponents)] else: - # self.w_recv will contain the data from the previous slice - w0s = fd.split(self.w_recv) + # wrecv will contain the data from the previous slice + w0s = fd.split(wrecv) else: w0s = get_step(n-1) @@ -473,13 +675,13 @@ def get_test(i): # time derivative if n == 0: - aao_form = (1.0/dt)*self.form_mass(*w1s, *dws) + aao_form = (1.0/dt)*mass(*w1s, *dws) else: - aao_form += (1.0/dt)*self.form_mass(*w1s, *dws) - aao_form -= (1.0/dt)*self.form_mass(*w0s, *dws) + aao_form += (1.0/dt)*mass(*w1s, *dws) + aao_form -= (1.0/dt)*mass(*w0s, *dws) # vector field - aao_form += theta*self.form_function(*w1s, *dws) - aao_form += (1-theta)*self.form_function(*w0s, *dws) + aao_form += theta*function(*w1s, *dws, self.time[n]) + aao_form += (1.0 - theta)*function(*w0s, *dws, self.time[n]-dt) - self.aao_form = aao_form + return aao_form diff --git a/asQ/common.py b/asQ/common.py new file mode 100644 index 00000000..b44cca6b --- /dev/null +++ b/asQ/common.py @@ -0,0 +1,9 @@ +from firedrake.petsc import PETSc + + +def get_option_from_list(option_name, option_list, default_index=None): + default = option_list[default_index] if default_index is not None else None + option = PETSc.Options().getString(option_name, default=default) + if option not in option_list: + raise ValueError(f"{option} must be one of "+" or ".join(option_list)) + return option diff --git a/asQ/diag_preconditioner.py b/asQ/diag_preconditioner.py index 457edad8..902a4625 100644 --- a/asQ/diag_preconditioner.py +++ b/asQ/diag_preconditioner.py @@ -6,6 +6,10 @@ from asQ.pencil import Pencil, Subcomm import importlib from asQ.profiling import memprofile +from asQ.common import get_option_from_list +from asQ.allatoncesystem import time_average + +from functools import partial import asQ.complex_proxy.vector as cpx @@ -69,6 +73,36 @@ def construct_riesz_map(W, prefix, fieldsplit=False, riesz_options=None): class DiagFFTPC(object): + """ + PETSc options: + + 'diagfft_linearisation': <'consistent', 'user'> + Which form to linearise when constructing the block Jacobians. + Default is 'consistent'. + + 'consistent': use the same form used in the AllAtOnceSystem residual. + 'user': use the alternative forms given to the AllAtOnceSystem. + + 'diagfft_state': <'window', 'slice', 'linear', 'initial', 'reference'> + Which state to linearise around when constructing the block Jacobians. + Default is 'window'. + + 'window': use the time average over the entire AllAtOnceSystem. + 'slice': use the time average over timesteps on the local Ensemble member. + 'linear': the form linearised is already linear, so no update to the state is needed + 'initial': use the initial condition is used for all timesteps. 
+ 'reference': use the reference state of the AllAtOnceSystem for all timesteps. + + 'diagfft_mass': + The solver options for the Riesz map. + Default is {'pc_type': 'lu'} + Use 'diagfft_mass_fieldsplit' if the single-timestep function space is mixed. + + 'diagfft_block_%d': + Default is the Firedrake default options. + The solver options for the %d'th block, enumerated globally. + Use 'diagfft_block' to set options for all blocks. + """ prefix = "diagfft_" def __init__(self): @@ -119,13 +153,15 @@ def initialize(self, pc): paradiag.diagfftpc = self # option for whether to use slice or window average for block jacobian - self.jac_average = PETSc.Options().getString( - f"{prefix}{self.prefix}jac_average", default='window') + valid_jac_state = ['window', 'slice', 'linear', 'initial', 'reference'] + jac_option = f"{prefix}{self.prefix}state" - valid_jac_averages = ['window', 'slice'] + self.jac_state = partial(get_option_from_list, + jac_option, valid_jac_state, default_index=0) + jac_state = self.jac_state() - if self.jac_average not in valid_jac_averages: - raise ValueError("diagfft_jac_average must be one of "+" or ".join(valid_jac_averages)) + if jac_state == 'reference' and self.aaos.reference_state is None: + raise ValueError("AllAtOnceSystem must be provided a reference state to use \'reference\' for diagfft_jac_state.") # this time slice part of the all at once solution self.w_all = self.aaos.w_all @@ -150,6 +186,7 @@ def initialize(self, pc): C2col = np.zeros(self.ntimesteps) dt = self.aaos.dt + self.t_average = fd.Constant(self.aaos.t0 + (self.ntimesteps + 1)*dt/2) theta = self.aaos.theta C1col[:2] = np.array([1, -1])/dt C2col[:2] = np.array([theta, 1-theta]) @@ -168,7 +205,7 @@ def initialize(self, pc): bc, 0*bc.function_arg))) # function to do global reduction into for average block jacobian - if self.jac_average == 'window': + if jac_state in ('window', 'slice'): self.ureduce = fd.Function(self.function_space) self.uwrk = fd.Function(self.function_space) @@ -225,8 +262,23 @@ def _make_block(self, i, prefix): # parts and then linearising. # This is constructed by cpx.derivative - form_mass = self.aaos.form_mass - form_function = self.aaos.form_function + # Building the nonlinear operator + self.Jsolvers = [] + + # which form to linearise around + valid_linearisations = ['consistent', 'user'] + linear_option = f"{prefix}{self.prefix}linearisation" + + linear = get_option_from_list(linear_option, valid_linearisations, default_index=0) + + if linear == 'consistent': + form_mass = self.aaos.form_mass + form_function = self.aaos.form_function + elif linear == 'user': + form_mass = self.aaos.jacobian_mass + form_function = self.aaos.jacobian_function + + form_function = partial(form_function, t=self.t_average) ii = self.aaos.transform_index(i, from_range='slice', to_range='window') d1 = self.D1[ii] @@ -291,22 +343,26 @@ def update(self, pc): an operator that is block diagonal in the 2x2 system coupling real and imaginary parts. ''' - self.ureduce.assign(0.) 
- - urs = self.ureduce.subfunctions - for i in range(self.nlocal_timesteps): - for ur, ui in zip(urs, self.aaos.get_field_components(i)): - ur.assign(ur + ui) - - # average only over current time-slice - if self.jac_average == 'slice': - self.ureduce /= fd.Constant(self.nlocal_timesteps) - else: # implies self.jac_average == 'window': - self.paradiag.ensemble.allreduce(self.ureduce, self.uwrk) - self.ureduce.assign(self.uwrk/fd.Constant(self.ntimesteps)) - - cpx.set_real(self.u0, self.ureduce) - cpx.set_imag(self.u0, self.ureduce) + jac_state = self.jac_state() + if jac_state == 'linear': + return + + elif jac_state == 'initial': + ustate = self.aaos.initial_condition + + elif jac_state == 'reference': + ustate = self.aaos.reference_state + + elif jac_state in ('window', 'slice'): + time_average(self.aaos, self.ureduce, self.uwrk, average=jac_state) + ustate = self.ureduce + + cpx.set_real(self.u0, ustate) + cpx.set_imag(self.u0, ustate) + + self.t_average.assign(self.aaos.t0 + (self.aaos.ntimesteps + 1)*self.aaos.dt/2) + + return @PETSc.Log.EventDecorator() @memprofile diff --git a/asQ/paradiag.py b/asQ/paradiag.py index 1a7f2fd5..20c81dde 100644 --- a/asQ/paradiag.py +++ b/asQ/paradiag.py @@ -4,7 +4,7 @@ from functools import partial from .profiling import memprofile -from asQ.allatoncesystem import AllAtOnceSystem +from asQ.allatoncesystem import AllAtOnceSystem, JacobianMatrix from asQ.parallel_arrays import SharedArray appctx = {} @@ -39,47 +39,61 @@ def create_ensemble(time_partition, comm=fd.COMM_WORLD): class paradiag(object): @memprofile def __init__(self, ensemble, - form_function, form_mass, w0, dt, theta, + form_function, form_mass, + w0, dt, theta, alpha, time_partition, bcs=[], solver_parameters={}, - circ="picard", + reference_state=None, + jacobian_function=None, + jacobian_mass=None, + circ="", tol=1.0e-6, maxits=10, ctx={}, block_ctx={}, block_mat_type="aij"): """A class to implement paradiag timestepping. :arg ensemble: the ensemble communicator - :arg form_function: a function that returns a linear form - on w0.function_space() providing f(w) for the ODE w_t + f(w) = 0. - :arg form_mass: a function that returns a linear form - on W providing the mass operator for the time derivative. - :arg w0: a Function from W containing the initial data + :arg form_function: a function that returns a form + on w0.function_space() providing f(w) for the ODE w_t + f(w) = 0. + :arg form_mass: a function that returns a linear form on + w0.function_space providing the mass operator for the time derivative. + :arg jacobian_function: a function that returns a form + on w0.function_space() which will be linearised to approximate + derivative(f(w), w) for the ODE w_t + f(w) = 0. + :arg jacobian_mass: a function that returns a linear form on w0.function_space + providing which will be linearised to approximate the form_mass + :arg w0: a Function from w0.function_space containing the initial data. :arg dt: float, the timestep size. - :arg theta: float, implicit timestepping parameter - :arg alpha: float, circulant matrix parameter + :arg theta: float, implicit timestepping parameter. + :arg alpha: float, circulant matrix parameter. :arg time_partition: a list of integers, the number of timesteps - assigned to each rank - :arg bcs: a list of DirichletBC boundary conditions on W - :arg solver_parameters: options dictionary for nonlinear solver + assigned to each rank. + :arg bcs: a list of DirichletBC boundary conditions on w0.function_space. 
+ :arg solver_parameters: options dictionary for nonlinear solver. + :arg reference_state: a Function in W to use as a reference state + e.g. in DiagFFTPC. :arg circ: a string describing the option on where to use the - alpha-circulant modification. "picard" - do a nonlinear wave - form relaxation method. "quasi" - do a modified Newton - method with alpha-circulant modification added to the - Jacobian. To make the alpha circulant modification only in the - preconditioner, simply set ksp_type:preonly in the solve options. + alpha-circulant modification. "picard" - do a nonlinear wave + form relaxation method. "quasi" - do a modified Newton + method with alpha-circulant modification added to the + Jacobian. To make the alpha circulant modification only in the + preconditioner, simply set ksp_type:preonly in the solve options. :arg tol: float, the tolerance for the relaxation method (if used) :arg maxits: integer, the maximum number of iterations for the - relaxation method, if used. + relaxation method, if used. :arg ctx: application context for solvers. :arg block_ctx: non-petsc context for solvers. :arg block_mat_type: set the type of the diagonal block systems. - Default is aij. + Default is aij. """ self.aaos = AllAtOnceSystem(ensemble, time_partition, dt, theta, form_mass, form_function, - w0, bcs, - circ, alpha) + w0, bcs=bcs, + reference_state=reference_state, + jacobian_function=jacobian_function, + jacobian_mass=jacobian_mass, + circ=circ, alpha=alpha) self.ensemble = ensemble self.layout = self.aaos.layout @@ -120,13 +134,21 @@ def __init__(self, ensemble, self.snes = PETSc.SNES().create(comm=ensemble.global_comm) self.opts = OptionsManager(self.flat_solver_parameters, '') self.snes.setOptionsPrefix('') - self.snes.setFunction(self.aaos._assemble_function, self.F) - # set up the Jacobian + # set up the residual function + def assemble_function(snes, X, Fvec): + self.aaos._assemble_function(snes, X, Fvec) + + self.snes.setFunction(assemble_function, self.F) + + # set up the Jacobian using the provided options + with self.opts.inserted_options(): + self.jacobian = JacobianMatrix(self.aaos, self.snes) + Jacmat = PETSc.Mat().create(comm=ensemble.global_comm) Jacmat.setType("python") Jacmat.setSizes(((nlocal, nglobal), (nlocal, nglobal))) - Jacmat.setPythonContext(self.aaos.jacobian) + Jacmat.setPythonContext(self.jacobian) Jacmat.setUp() self.Jacmat = Jacmat @@ -134,6 +156,7 @@ def form_jacobian(snes, X, J, P): # copy the snes state vector into self.X X.copy(self.X) self.aaos.update(X) + self.jacobian.update() J.assemble() P.assemble() @@ -192,11 +215,14 @@ def solve(self, """ for wndw in range(nwindows): - preproc(self, wndw) + with self.aaos.w_all.dat.vec_ro as v: + v.copy(self.X) + with self.opts.inserted_options(): self.snes.solve(None, self.X) + self.aaos.update(self.X) self._record_diagnostics() @@ -204,8 +230,10 @@ def solve(self, postproc(self, wndw) converged_reason = self.snes.getConvergedReason() - is_linear = ('snes_type' in self.flat_solver_parameters - and self.flat_solver_parameters['snes_type'] == 'ksponly') + is_linear = ( + 'snes_type' in self.flat_solver_parameters + and self.flat_solver_parameters['snes_type'] == 'ksponly' + ) if is_linear and (converged_reason == 5): pass elif not (1 < converged_reason < 5): diff --git a/examples/advection/periodic_dg_advection.py b/examples/advection/periodic_dg_advection.py index c7a00ccc..8326698c 100644 --- a/examples/advection/periodic_dg_advection.py +++ b/examples/advection/periodic_dg_advection.py @@ -1,4 +1,3 @@ - import 
matplotlib.pyplot as plt from matplotlib.animation import FuncAnimation from math import pi, cos, sin @@ -92,7 +91,7 @@ def form_mass(q, phi): # The DG advection form for the scalar advection equation. # asQ assumes that the function form is nonlinear so here # q is a Function and phi is a TestFunction -def form_function(q, phi): +def form_function(q, phi, t): # upwind switch n = fd.FacetNormal(mesh) un = 0.5*(fd.dot(u, n) + abs(fd.dot(u, n))) @@ -112,8 +111,8 @@ def form_function(q, phi): # The PETSc solver parameters used to solve the # blocks in step (b) of inverting the ParaDiag matrix. block_parameters = { - 'ksp_type': 'gmres', - 'pc_type': 'bjacobi', + 'ksp_type': 'preonly', + 'pc_type': 'lu', } # The PETSc solver parameters for solving the all-at-once system. @@ -131,22 +130,18 @@ def form_function(q, phi): # 'ksp_type': 'preonly' paradiag_parameters = { + 'snes_type': 'ksponly', 'snes': { - 'linesearch_type': 'basic', 'monitor': None, 'converged_reason': None, 'rtol': 1e-10, - 'atol': 1e-12, - 'stol': 1e-12, }, 'mat_type': 'matfree', - 'ksp_type': 'preonly', + 'ksp_type': 'richardson', 'ksp': { 'monitor': None, 'converged_reason': None, 'rtol': 1e-10, - 'atol': 1e-12, - 'stol': 1e-12, }, 'pc_type': 'python', 'pc_python_type': 'asQ.DiagFFTPC' diff --git a/examples/advection/unsteady_dg_advection.py b/examples/advection/unsteady_dg_advection.py new file mode 100644 index 00000000..1f760df4 --- /dev/null +++ b/examples/advection/unsteady_dg_advection.py @@ -0,0 +1,256 @@ +import matplotlib.pyplot as plt +from matplotlib.animation import FuncAnimation +from math import pi, cos, sin + +import firedrake as fd +from firedrake.petsc import PETSc +import asQ + +import argparse + +parser = argparse.ArgumentParser( + description='ParaDiag timestepping for scalar advection of a Gaussian bump in a periodic square with DG in space and implicit-theta in time.\n' + +'Advecting velocity varies in time sinusoidally.' 
+ +'Based on the Firedrake DG advection example https://www.firedrakeproject.org/demos/DG_advection.py.html', + formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument('--nx', type=int, default=64, help='Number of cells along each square side.') +parser.add_argument('--cfl', type=float, default=0.8, help='Convective CFL number.') +parser.add_argument('--angle', type=float, default=pi/6, help='Angle of the convective velocity.') +parser.add_argument('--degree', type=int, default=1, help='Degree of the scalar and velocity spaces.') +parser.add_argument('--theta', type=float, default=0.5, help='Parameter for the implicit theta timestepping method.') +parser.add_argument('--width', type=float, default=0.1, help='Width of the Gaussian bump.') +parser.add_argument('--nwindows', type=int, default=1, help='Number of time-windows.') +parser.add_argument('--nslices', type=int, default=2, help='Number of time-slices per time-window.') +parser.add_argument('--slice_length', type=int, default=2, help='Number of timesteps per time-slice.') +parser.add_argument('--alpha', type=float, default=0.0001, help='Circulant coefficient.') +parser.add_argument('--nsample', type=int, default=32, help='Number of sample points for plotting.') +parser.add_argument('--show_args', action='store_true', help='Output all the arguments.') + +args = parser.parse_known_args() +args = args[0] + +if args.show_args: + PETSc.Sys.Print(args) + +# The time partition describes how many timesteps are included on each time-slice of the ensemble +# Here we use the same number of timesteps on each slice, but they can be different + +time_partition = tuple(args.slice_length for _ in range(args.nslices)) +window_length = sum(time_partition) +nsteps = args.nwindows*window_length + +# Calculate the timestep from the CFL number +umax = 1. +dx = 1./args.nx +dt = args.cfl*dx/umax + +# timescale of the domain +T = 1./umax + +period = 0.5*T + +# The Ensemble with the spatial and time communicators +ensemble = asQ.create_ensemble(time_partition) + +# # # === --- domain --- === # # # + +# The mesh needs to be created with the spatial communicator +mesh = fd.PeriodicUnitSquareMesh(args.nx, args.nx, quadrilateral=True, comm=ensemble.comm) + +# We use a discontinuous Galerkin space for the advected scalar +# and a continuous Galerkin space for the advecting velocity field +V = fd.FunctionSpace(mesh, "DQ", args.degree) +W = fd.VectorFunctionSpace(mesh, "CG", args.degree) + +# # # === --- initial conditions --- === # # # + +x, y = fd.SpatialCoordinate(mesh) + + +def radius(x, y): + return fd.sqrt(pow(x-0.5, 2) + pow(y-0.5, 2)) + + +def gaussian(x, y): + return fd.exp(-0.5*pow(radius(x, y)/args.width, 2)) + + +# The scalar initial conditions are a Gaussian bump centred at (0.5, 0.5) +q0 = fd.Function(V, name="scalar_initial") +q0.interpolate(1 + gaussian(x, y)) + +# The advecting velocity field is constant and directed at an angle to the x-axis +u = fd.Function(W, name='velocity') +u.interpolate(0.5*fd.as_vector((umax*cos(args.angle), umax*sin(args.angle)))) + +# time-varying perturbation to advecting velocity +up = fd.Function(W, name='velocity-perturbation') +up.interpolate(fd.as_vector((1.0*umax, -1.0*umax))) + +# # # === --- finite element forms --- === # # # + + +# The time-derivative mass form for the scalar advection equation. 
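+# Together with form_function below, these two forms discretise the
+# conservative advection equation q_t + div(v(t) q) = 0, where the spatially
+# uniform (hence divergence-free) velocity v(t) = u + up*cos(2*pi*t/T)
+# oscillates in time.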
+# asQ assumes that the mass form is linear so here +# q is a TrialFunction and phi is a TestFunction +def form_mass(q, phi): + return phi*q*fd.dx + + +# The DG advection form for the scalar advection equation. +# asQ assumes that the function form is nonlinear so here +# q is a Function and phi is a TestFunction +def form_function(q, phi, t): + # upwind switch + v = u + up*fd.cos(2*pi*t/T) + n = fd.FacetNormal(mesh) + un = 0.5*(fd.dot(v, n) + abs(fd.dot(v, n))) + + # integration over element volume + int_cell = q*fd.div(phi*v)*fd.dx + + # integration over internal facets + int_facet = (phi('+')-phi('-'))*(un('+')*q('+')-un('-')*q('-'))*fd.dS + + return int_facet - int_cell + + +# # # === --- PETSc solver parameters --- === # # # + + +# The PETSc solver parameters used to solve the +# blocks in step (b) of inverting the ParaDiag matrix. +block_parameters = { + 'ksp_type': 'preonly', + 'pc_type': 'lu', +} + +# The PETSc solver parameters for solving the all-at-once system. +# The python preconditioner 'asQ.DiagFFTPC' applies the ParaDiag matrix. +# +# The equation is linear so we can either: +# a) Solve it in one shot using a preconditioned Krylov method: +# P^{-1}Au = P^{-1}b +# The solver options for this are: +# 'ksp_type': 'fgmres' +# We need fgmres here because gmres is used on the blocks. +# b) Solve it with Picard iterations: +# Pu_{k+1} = (P - A)u_{k} + b +# The solver options for this are: +# 'ksp_type': 'preonly' + +paradiag_parameters = { + 'snes_type': 'ksponly', + 'snes': { + 'monitor': None, + 'converged_reason': None, + 'rtol': 1e-8, + }, + 'mat_type': 'matfree', + 'ksp_type': 'gmres', + 'ksp': { + 'monitor': None, + 'converged_reason': None, + 'rtol': 1e-8, + }, + 'pc_type': 'python', + 'pc_python_type': 'asQ.DiagFFTPC' +} + +# We need to add a block solver parameters dictionary for each block. +# Here they are all the same but they could be different. +for i in range(window_length): + paradiag_parameters['diagfft_block_'+str(i)+'_'] = block_parameters + + +# # # === --- Setup ParaDiag --- === # # # + + +# Give everything to asQ to create the paradiag object. +# the circ parameter determines where the alpha-circulant +# approximation is introduced. None means only in the preconditioner. +pdg = asQ.paradiag(ensemble=ensemble, + form_function=form_function, + form_mass=form_mass, + w0=q0, dt=dt, theta=args.theta, + alpha=args.alpha, time_partition=time_partition, + solver_parameters=paradiag_parameters, + circ=None) + + +# This is a callback which will be called before pdg solves each time-window +# We can use this to make the output a bit easier to read +def window_preproc(pdg, wndw): + PETSc.Sys.Print('') + PETSc.Sys.Print(f'### === --- Calculating time-window {wndw} --- === ###') + PETSc.Sys.Print('') + + +# The last time-slice will be saving snapshots to create an animation. +# The layout member describes the time_partition. +# layout.is_local(i) returns True/False if the timestep index i is on the +# current time-slice. Here we use -1 to mean the last timestep in the window. +is_last_slice = pdg.layout.is_local(-1) + +# Make an output Function on the last time-slice and start a snapshot list +if is_last_slice: + qout = fd.Function(V) + timeseries = [q0.copy(deepcopy=True)] + + +# This is a callback which will be called after pdg solves each time-window +# We can use this to save the last timestep of each window for plotting. +def window_postproc(pdg, wndw): + if is_last_slice: + # The aaos is the AllAtOnceSystem which represents the time-dependent problem. 
+ # get_field extracts one timestep of the window. -1 is again used to get the last + # timestep and place it in qout. + pdg.aaos.get_field(-1, index_range='window', wout=qout) + timeseries.append(qout.copy(deepcopy=True)) + + +# Solve nwindows of the all-at-once system +pdg.solve(args.nwindows, + preproc=window_preproc, + postproc=window_postproc) + + +# # # === --- Postprocessing --- === # # # + +# paradiag collects a few solver diagnostics for us to inspect +nw = args.nwindows + +# Number of nonlinear iterations, total and per window. +# (1 for fgmres and # picard iterations for preonly) +PETSc.Sys.Print(f'nonlinear iterations: {pdg.nonlinear_iterations} | iterations per window: {pdg.nonlinear_iterations/nw}') + +# Number of linear iterations, total and per window. +# (# of gmres iterations for fgmres and # picard iterations for preonly) +PETSc.Sys.Print(f'linear iterations: {pdg.linear_iterations} | iterations per window: {pdg.linear_iterations/nw}') + +# Number of iterations needed for each block in step-(b), total and per block solve +# The number of iterations for each block will usually be different because of the different eigenvalues +PETSc.Sys.Print(f'block linear iterations: {pdg.block_iterations._data} | iterations per block solve: {pdg.block_iterations._data/pdg.linear_iterations}') + +# We can write these diagnostics to file, along with some other useful information. +# Files written are: aaos_metrics.txt, block_metrics.txt, paradiag_setup.txt, solver_parameters.txt +asQ.write_paradiag_metrics(pdg, directory='metrics') + +# Make an animation from the snapshots we collected and save it to periodic.mp4. +if is_last_slice: + + fn_plotter = fd.FunctionPlotter(mesh, num_sample_points=args.nsample) + + fig, axes = plt.subplots() + axes.set_aspect('equal') + colors = fd.tripcolor(qout, num_sample_points=args.nsample, vmin=1, vmax=2, axes=axes) + fig.colorbar(colors) + + def animate(q): + colors.set_array(fn_plotter(q)) + + interval = 4e2 + animation = FuncAnimation(fig, animate, frames=timeseries, interval=interval) + + animation.save("periodic.mp4", writer="ffmpeg") diff --git a/examples/heat/heat.py b/examples/heat/heat.py new file mode 100644 index 00000000..1fb795ce --- /dev/null +++ b/examples/heat/heat.py @@ -0,0 +1,197 @@ +import firedrake as fd +from firedrake.petsc import PETSc +import asQ + +import argparse + +parser = argparse.ArgumentParser( + description='While we try to figure out how to implement time-dependent Dirichlet BCs, one can use Nitsche-type penalty method. 
Here, we consider Heat equatiion(u_t = Delta u) with boundary conditions match those of exp(1.25t + 0.5x + y)', + formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument('--nx', type=int, default=64, help='Number of cells along each square side.') +parser.add_argument('--degree', type=int, default=1, help='Degree of the scalar and velocity spaces.') +parser.add_argument('--theta', type=float, default=0.5, help='Parameter for the implicit theta timestepping method.') +parser.add_argument('--nwindows', type=int, default=1, help='Number of time-windows.') +parser.add_argument('--nslices', type=int, default=2, help='Number of time-slices per time-window.') +parser.add_argument('--slice_length', type=int, default=2, help='Number of timesteps per time-slice.') +parser.add_argument('--alpha', type=float, default=0.0001, help='Circulant coefficient.') +parser.add_argument('--nsample', type=int, default=32, help='Number of sample points for plotting.') +parser.add_argument('--show_args', action='store_true', help='Output all the arguments.') + +args = parser.parse_known_args() +args = args[0] + +if args.show_args: + PETSc.Sys.Print(args) + +# The time partition describes how many timesteps are included on each time-slice of the ensemble +# Here we use the same number of timesteps on each slice, but they can be different + +time_partition = tuple(args.slice_length for _ in range(args.nslices)) +window_length = sum(time_partition) +nsteps = args.nwindows*window_length + +# Set the timesstep +nx = args.nx +dt = 1./nx + +# The Ensemble with the spatial and time communicators +ensemble = asQ.create_ensemble(time_partition) + +# # # === --- domain --- === # # # + +# The mesh needs to be created with the spatial communicator +mesh = fd.UnitSquareMesh(args.nx, args.nx, quadrilateral=False, comm=ensemble.comm) + +V = fd.FunctionSpace(mesh, "CG", args.degree) + +# # # === --- initial conditions --- === # # # +# q_exact = exp(0.5x + y + 1.25t), u_t-\Deltau = 0. + +x, y = fd.SpatialCoordinate(mesh) +n = fd.FacetNormal(mesh) +# Initial conditions. +w0 = fd.Function(V, name="scalar_initial") +w0.interpolate(fd.exp(0.5*x + y)) + +# # # === --- finite element forms --- === # # # + + +# asQ assumes that the mass form is linear so here +# q is a TrialFunction and phi is a TestFunction +def form_mass(q, phi): + return phi*q*fd.dx + + +# q is a Function and phi is a TestFunction +def form_function(q, phi, t): + return fd.inner(fd.grad(q), fd.grad(phi))*fd.dx - fd.inner(phi, fd.inner(fd.grad(q), n))*fd.ds - fd.inner(q-fd.exp(0.5*x + y + 1.25*t), fd.inner(fd.grad(phi), n))*fd.ds + 20*nx*fd.inner(q-fd.exp(0.5*x + y + 1.25*t), phi)*fd.ds + + +# # # === --- PETSc solver parameters --- === # # # + + +# The PETSc solver parameters used to solve the +# blocks in step (b) of inverting the ParaDiag matrix. +block_parameters = { + 'ksp_type': 'preonly', + 'pc_type': 'lu', +} + +# The PETSc solver parameters for solving the all-at-once system. +# The python preconditioner 'asQ.DiagFFTPC' applies the ParaDiag matrix. +# +# The equation is linear so we can either: +# a) Solve it in one shot using a preconditioned Krylov method: +# P^{-1}Au = P^{-1}b +# The solver options for this are: +# 'ksp_type': 'fgmres' +# We need fgmres here because gmres is used on the blocks. 
+# b) Solve it with Picard iterations: +# Pu_{k+1} = (P - A)u_{k} + b +# The solver options for this are: +# 'ksp_type': 'preonly' + +paradiag_parameters = { + 'snes': { + 'linesearch_type': 'basic', + 'monitor': None, + 'converged_reason': None, + 'rtol': 1e-10, + 'atol': 1e-12, + 'stol': 1e-12, + }, + 'mat_type': 'matfree', + 'ksp_type': 'preonly', + 'ksp': { + 'monitor': None, + 'converged_reason': None, + 'rtol': 1e-10, + 'atol': 1e-12, + 'stol': 1e-12, + }, + 'pc_type': 'python', + 'pc_python_type': 'asQ.DiagFFTPC' +} + +# We need to add a block solver parameters dictionary for each block. +# Here they are all the same but they could be different. +for i in range(window_length): + paradiag_parameters['diagfft_block_'+str(i)+'_'] = block_parameters + + +# # # === --- Setup ParaDiag --- === # # # + + +# Give everything to asQ to create the paradiag object. +# the circ parameter determines where the alpha-circulant +# approximation is introduced. None means only in the preconditioner. +pdg = asQ.paradiag(ensemble=ensemble, + form_function=form_function, + form_mass=form_mass, + w0=w0, dt=dt, theta=args.theta, + alpha=args.alpha, time_partition=time_partition, + solver_parameters=paradiag_parameters, + circ=None) + + +# This is a callback which will be called before pdg solves each time-window +# We can use this to make the output a bit easier to read +def window_preproc(pdg, wndw): + PETSc.Sys.Print('') + PETSc.Sys.Print(f'### === --- Calculating time-window {wndw} --- === ###') + PETSc.Sys.Print('') + + +# We find the L2-error at each timestep +q_exact = fd.Function(V) +qp = fd.Function(V) +errors = asQ.SharedArray(time_partition, comm=ensemble.ensemble_comm) +times = asQ.SharedArray(time_partition, comm=ensemble.ensemble_comm) + + +def window_postproc(pdg, wndw): + aaos = pdg.aaos + + for step in range(aaos.ntimesteps): + if aaos.layout.is_local(step): + local_step = aaos.transform_index(step, from_range='window') + t = aaos.time[local_step] + q_exact.interpolate(fd.exp(.5*x + y + 1.25*t)) + aaos.get_field(local_step, wout=qp) + errors.dlocal[local_step] = fd.errornorm(qp, q_exact) + times.dlocal[local_step] = t + + errors.synchronise() + times.synchronise() + + for step in range(aaos.ntimesteps): + PETSc.Sys.Print(f"Time={str(times.dglobal[step]).ljust(8, ' ')}, qerr={errors.dglobal[step]}") + + +# Solve nwindows of the all-at-once system +pdg.solve(args.nwindows, + preproc=window_preproc, + postproc=window_postproc) + + +# # # === --- Postprocessing --- === # # # + +# paradiag collects a few solver diagnostics for us to inspect +nw = args.nwindows + +# Number of nonlinear iterations, total and per window. +# (1 for fgmres and # picard iterations for preonly) +PETSc.Sys.Print(f'nonlinear iterations: {pdg.nonlinear_iterations} | iterations per window: {pdg.nonlinear_iterations/nw}') + +# Number of linear iterations, total and per window. +# (# of gmres iterations for fgmres and # picard iterations for preonly) +PETSc.Sys.Print(f'linear iterations: {pdg.linear_iterations} | iterations per window: {pdg.linear_iterations/nw}') + +# Number of iterations needed for each block in step-(b), total and per block solve +# The number of iterations for each block will usually be different because of the different eigenvalues +PETSc.Sys.Print(f'block linear iterations: {pdg.block_iterations._data} | iterations per block solve: {pdg.block_iterations._data/pdg.linear_iterations}') + +# We can write these diagnostics to file, along with some other useful information. 
+# Files written are: aaos_metrics.txt, block_metrics.txt, paradiag_setup.txt, solver_parameters.txt +asQ.write_paradiag_metrics(pdg) diff --git a/examples/heat/mms.py b/examples/heat/mms.py new file mode 100644 index 00000000..d3a246b9 --- /dev/null +++ b/examples/heat/mms.py @@ -0,0 +1,200 @@ +from math import pi +import firedrake as fd +from firedrake.petsc import PETSc +import numpy as np +import asQ + +import argparse + +parser = argparse.ArgumentParser( + description='ParaDiag timestepping for a linear equation with time-dpendent coefficient. Here, we use the MMS to solve u_t - (1+2sin(pi x) sin(pi y)) Delta u = f over the domain, Omega = [0,1]^2 with Dirichlet BCs $', + formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument('--nx', type=int, default=64, help='Number of cells along each square side.') +parser.add_argument('--degree', type=int, default=1, help='Degree of the scalar and velocity spaces.') +parser.add_argument('--theta', type=float, default=0.5, help='Parameter for the implicit theta timestepping method.') +parser.add_argument('--nwindows', type=int, default=1, help='Number of time-windows.') +parser.add_argument('--nslices', type=int, default=2, help='Number of time-slices per time-window.') +parser.add_argument('--slice_length', type=int, default=2, help='Number of timesteps per time-slice.') +parser.add_argument('--alpha', type=float, default=0.0001, help='Circulant coefficient.') +parser.add_argument('--nsample', type=int, default=32, help='Number of sample points for plotting.') +parser.add_argument('--show_args', action='store_true', help='Output all the arguments.') + +args = parser.parse_known_args() +args = args[0] + +if args.show_args: + PETSc.Sys.Print(args) + +# The time partition describes how many timesteps are included on each time-slice of the ensemble +# Here we use the same number of timesteps on each slice, but they can be different + +time_partition = tuple(args.slice_length for _ in range(args.nslices)) +window_length = sum(time_partition) +nsteps = args.nwindows*window_length + +# Set the timesstep +dx = 1./args.nx +dt = dx + +# The Ensemble with the spatial and time communicators +ensemble = asQ.create_ensemble(time_partition) + +# # # === --- domain --- === # # # + +# The mesh needs to be created with the spatial communicator +mesh = fd.UnitSquareMesh(args.nx, args.nx, quadrilateral=False, comm=ensemble.comm) + +V = fd.FunctionSpace(mesh, "CG", args.degree) + +# # # === --- initial conditions --- === # # # + +x, y = fd.SpatialCoordinate(mesh) + +# We use the method of manufactured solutions, prescribing a Rhs, initial and boundary data to exactly match those of a known solution. +u_exact = fd.sin(pi*x)*fd.cos(pi*y) + + +def time_coef(t): + return 2 + fd.sin(2*pi*t) + + +def Rhs(t): + # As our exact solution is independent of t, the Rhs is just $-(2 + \sin(2*pi*t)) \Delta u$ + return -(2 + fd.sin(2*pi*t))*fd.div(fd.grad(u_exact)) + + +# Initial conditions. 
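+# The manufactured solution u_exact is independent of time, so it supplies the
+# initial data here and the Dirichlet boundary data below; all of the time
+# dependence enters through the diffusion coefficient and the manufactured Rhs.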
+w0 = fd.Function(V, name="scalar_initial") +w0.interpolate(u_exact) +# Dirichlet BCs +bcs = [fd.DirichletBC(V, u_exact, 'on_boundary')] + + +# # # === --- finite element forms --- === # # # + + +# asQ assumes that the mass form is linear so here +# q is a TrialFunction and phi is a TestFunction +def form_mass(q, phi): + return phi*q*fd.dx + + +# q is a Function and phi is a TestFunction +def form_function(q, phi, t): + return time_coef(t)*fd.inner(fd.grad(q), fd.grad(phi))*fd.dx - fd.inner(Rhs(t), phi)*fd.dx + + +# # # === --- PETSc solver parameters --- === # # # + + +# The PETSc solver parameters used to solve the +# blocks in step (b) of inverting the ParaDiag matrix. +block_parameters = { + 'ksp_type': 'gmres', + 'pc_type': 'bjacobi', +} + +# The PETSc solver parameters for solving the all-at-once system. +# The python preconditioner 'asQ.DiagFFTPC' applies the ParaDiag matrix. +# +# The equation is linear so we can either: +# a) Solve it in one shot using a preconditioned Krylov method: +# P^{-1}Au = P^{-1}b +# The solver options for this are: +# 'ksp_type': 'fgmres' +# We need fgmres here because gmres is used on the blocks. +# b) Solve it with Picard iterations: +# Pu_{k+1} = (P - A)u_{k} + b +# The solver options for this are: +# 'ksp_type': 'preonly' + +paradiag_parameters = { + 'snes': { + 'linesearch_type': 'basic', + 'monitor': None, + 'converged_reason': None, + 'rtol': 1e-10, + 'atol': 1e-12, + 'stol': 1e-12, + }, + 'mat_type': 'matfree', + 'ksp_type': 'preonly', + 'ksp': { + 'monitor': None, + 'converged_reason': None, + 'rtol': 1e-10, + 'atol': 1e-12, + 'stol': 1e-12, + }, + 'pc_type': 'python', + 'pc_python_type': 'asQ.DiagFFTPC' +} + +# We need to add a block solver parameters dictionary for each block. +# Here they are all the same but they could be different. +for i in range(window_length): + paradiag_parameters['diagfft_block_'+str(i)+'_'] = block_parameters + + +# # # === --- Setup ParaDiag --- === # # # + + +# Give everything to asQ to create the paradiag object. +# the circ parameter determines where the alpha-circulant +# approximation is introduced. None means only in the preconditioner. 
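+# Roughly, the preconditioner replaces the block-Toeplitz all-at-once matrix
+# with an alpha-circulant approximation, which a discrete Fourier transform in
+# time block-diagonalises, so each (complex-valued) timestep block can then be
+# solved independently in step (b).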
+PD = asQ.paradiag(ensemble=ensemble, + form_function=form_function, + form_mass=form_mass, + w0=w0, dt=dt, theta=args.theta, + alpha=args.alpha, time_partition=time_partition, bcs=bcs, + solver_parameters=paradiag_parameters, + circ=None) + + +# This is a callback which will be called before pdg solves each time-window +# We can use this to make the output a bit easier to read +def window_preproc(pdg, wndw): + PETSc.Sys.Print('') + PETSc.Sys.Print(f'### === --- Calculating time-window {wndw} --- === ###') + PETSc.Sys.Print('') + + +qcheck = w0.copy(deepcopy=True) + + +def Exact(u_exact): + q_err = fd.errornorm(u_exact, qcheck) + return q_err + + +def window_postproc(pdg, wndw): + errors = np.zeros((window_length, 1)) + local_errors = np.zeros_like(errors) + + # collect errors for this slice + def for_each_callback(window_index, slice_index, w): + nonlocal local_errors + local_errors[window_index] = Exact(w) + pdg.aaos.for_each_timestep(for_each_callback) + + def for_each_callback(window_index, slice_index, w): + nonlocal local_errors + local_errors[window_index, :] = Exact(w) + + pdg.aaos.for_each_timestep(for_each_callback) + + # collect and print errors for full window + ensemble.ensemble_comm.Reduce(local_errors, errors, root=0) + if pdg.time_rank == 0: + for window_index in range(window_length): + timestep = wndw*window_length + window_index + q_err = errors[window_index, 0] + PETSc.Sys.Print(f"timestep={timestep}, q_err={q_err}", + comm=ensemble.comm) + + +# Solve nwindows of the all-at-once system +PD.solve(args.nwindows, + preproc=window_preproc, + postproc=window_postproc) diff --git a/examples/serial/shallow_water/galewsky_comparison.py b/examples/serial/shallow_water/galewsky_comparison.py index 2c6788c2..555f18b7 100644 --- a/examples/serial/shallow_water/galewsky_comparison.py +++ b/examples/serial/shallow_water/galewsky_comparison.py @@ -203,7 +203,7 @@ def form_mass(u, h, v, q): for _ in range(time_partition[ensemble.ensemble_comm.rank]): tm = mg.manifold_transfer_manager(W) transfer_managers.append(tm) -block_ctx['diag_transfer_managers'] = transfer_managers +block_ctx['diagfft_transfer_managers'] = transfer_managers miniapp = ComparisonMiniapp(ensemble, time_partition, form_mass, diff --git a/examples/serial/shallow_water/galewsky_serial.py b/examples/serial/shallow_water/galewsky_serial.py index 1cd5d8ce..6c5ab1dd 100644 --- a/examples/serial/shallow_water/galewsky_serial.py +++ b/examples/serial/shallow_water/galewsky_serial.py @@ -89,13 +89,17 @@ def form_mass(u, h, v, q): 'monitor': None, 'converged_reason': None, 'rtol': 1e-12, - 'atol': 1e-0 + 'atol': 1e-0, + 'ksp_ew': None, + 'ksp_ew_version': 1, }, 'mat_type': 'matfree', 'ksp_type': 'fgmres', 'ksp': { 'monitor': None, - 'converged_reason': None + 'converged_reason': None, + 'atol': 1e-5, + 'rtol': 1e-5, }, 'pc_type': 'mg', 'pc_mg_cycle_type': 'w', @@ -110,7 +114,7 @@ def form_mass(u, h, v, q): 'pc_patch_save_operators': True, 'pc_patch_partition_of_unity': True, 'pc_patch_sub_mat_type': 'seqdense', - 'pc_patch_construct_codim': 0, + 'pc_patch_construct_dim': 0, 'pc_patch_construct_type': 'vanka', 'pc_patch_local_type': 'additive', 'pc_patch_precompute_element_tensors': True, @@ -168,8 +172,8 @@ def postproc(app, step, t): linear_its += app.nlsolver.snes.getLinearSolveIterations() nonlinear_its += app.nlsolver.snes.getIterationNumber() - uout.assign(miniapp.w0.split()[0]) - hout.assign(miniapp.w0.split()[1]) + uout.assign(miniapp.w0.subfunctions[0]) + hout.assign(miniapp.w0.subfunctions[1]) ofile.write(uout, hout, 
potential_vorticity(uout), time=t) diff --git a/examples/serial/shallow_water/linear_swe_serial.py b/examples/serial/shallow_water/linear_swe_serial.py index d63df5b7..9863b0b8 100644 --- a/examples/serial/shallow_water/linear_swe_serial.py +++ b/examples/serial/shallow_water/linear_swe_serial.py @@ -72,17 +72,26 @@ def form_mass(u, h, v, q): patch_parameters = { - 'pc_patch_save_operators': True, - 'pc_patch_partition_of_unity': True, - 'pc_patch_sub_mat_type': 'seqdense', - 'pc_patch_construct_dim': 0, - 'pc_patch_construct_type': 'vanka', - 'pc_patch_local_type': 'additive', - 'pc_patch_precompute_element_tensors': True, - 'pc_patch_symmetrise_sweep': False, - 'sub_ksp_type': 'preonly', - 'sub_pc_type': 'lu', - 'sub_pc_factor_shift_type': 'nonzero', + 'pc_patch': { + 'save_operators': True, + 'partition_of_unity': True, + 'sub_mat_type': 'seqdense', + 'construct_dim': 0, + 'construct_type': 'vanka', + 'local_type': 'additive', + 'precompute_element_tensors': True, + 'symmetrise_sweep': False + }, + 'sub': { + 'ksp_type': 'preonly', + 'pc_type': 'fieldsplit', + 'pc_fieldsplit_type': 'schur', + 'pc_fieldsplit_detect_saddle_point': None, + 'pc_fieldsplit_schur_fact_type': 'full', + 'pc_fieldsplit_schur_precondition': 'full', + 'fieldsplit_ksp_type': 'preonly', + 'fieldsplit_pc_type': 'lu', + } } mg_parameters = { @@ -108,18 +117,20 @@ def form_mass(u, h, v, q): 'monitor': None, 'converged_reason': None, 'atol': 1e-0, - 'rtol': 1e-10 + 'rtol': 1e-10, + 'stol': 1e-12, }, 'mat_type': 'matfree', 'ksp_type': 'fgmres', - 'ksp_rtol': 1e-10, 'ksp': { 'atol': 1e-0, + 'rtol': 1e-10, + 'stol': 1e-12, 'monitor': None, 'converged_reason': None }, 'pc_type': 'mg', - 'pc_mg_cycle_type': 'v', + 'pc_mg_cycle_type': 'w', 'pc_mg_type': 'multiplicative', 'mg': mg_parameters } diff --git a/examples/shallow_water/galewsky.py b/examples/shallow_water/galewsky.py index 6debefc5..d5544228 100644 --- a/examples/shallow_water/galewsky.py +++ b/examples/shallow_water/galewsky.py @@ -24,6 +24,7 @@ parser.add_argument('--alpha', type=float, default=0.0001, help='Circulant coefficient.') parser.add_argument('--dt', type=float, default=0.5, help='Timestep in hours.') parser.add_argument('--filename', type=str, default='galewsky', help='Name of output vtk files') +parser.add_argument('--metrics_dir', type=str, default='metrics', help='Directory to save paradiag metrics to.') parser.add_argument('--show_args', action='store_true', help='Output all the arguments.') args = parser.parse_known_args() @@ -38,51 +39,65 @@ # time steps -time_partition = [args.slice_length for _ in range(args.nslices)] +time_partition = tuple((args.slice_length for _ in range(args.nslices))) window_length = sum(time_partition) nsteps = args.nwindows*window_length dt = args.dt*units.hour # parameters for the implicit diagonal solve in step-(b) + +patch_parameters = { + 'pc_patch': { + 'save_operators': True, + 'partition_of_unity': True, + 'sub_mat_type': 'seqdense', + 'construct_dim': 0, + 'construct_type': 'vanka', + 'local_type': 'additive', + 'precompute_element_tensors': True, + 'symmetrise_sweep': False + }, + 'sub': { + 'ksp_type': 'preonly', + 'pc_type': 'fieldsplit', + 'pc_fieldsplit_type': 'schur', + 'pc_fieldsplit_detect_saddle_point': None, + 'pc_fieldsplit_schur_fact_type': 'full', + 'pc_fieldsplit_schur_precondition': 'full', + 'fieldsplit_ksp_type': 'preonly', + 'fieldsplit_pc_type': 'lu', + } +} + +mg_parameters = { + 'levels': { + 'ksp_type': 'gmres', + 'ksp_max_it': 5, + 'pc_type': 'python', + 'pc_python_type': 
'firedrake.PatchPC', + 'patch': patch_parameters + }, + 'coarse': { + 'pc_type': 'python', + 'pc_python_type': 'firedrake.AssembledPC', + 'assembled_pc_type': 'lu', + 'assembled_pc_factor_mat_solver_type': 'mumps' + } +} + sparameters = { 'mat_type': 'matfree', 'ksp_type': 'fgmres', 'ksp': { - 'atol': 1e-8, - 'rtol': 1e-8, - 'max_it': 400, + 'atol': 1e-5, + 'rtol': 1e-5, + 'max_it': 60 }, 'pc_type': 'mg', 'pc_mg_cycle_type': 'w', 'pc_mg_type': 'multiplicative', - 'mg': { - 'levels': { - 'ksp_type': 'gmres', - 'ksp_max_it': 5, - 'pc_type': 'python', - 'pc_python_type': 'firedrake.PatchPC', - 'patch': { - 'pc_patch_save_operators': True, - 'pc_patch_partition_of_unity': True, - 'pc_patch_sub_mat_type': 'seqdense', - 'pc_patch_construct_dim': 0, - 'pc_patch_construct_type': 'vanka', - 'pc_patch_local_type': 'additive', - 'pc_patch_precompute_element_tensors': True, - 'pc_patch_symmetrise_sweep': False, - 'sub_ksp_type': 'preonly', - 'sub_pc_type': 'lu', - 'sub_pc_factor_shift_type': 'nonzero', - }, - }, - 'coarse': { - 'pc_type': 'python', - 'pc_python_type': 'firedrake.AssembledPC', - 'assembled_pc_type': 'lu', - 'assembled_pc_factor_mat_solver_type': 'mumps', - }, - } + 'mg': mg_parameters } sparameters_diag = { @@ -91,18 +106,23 @@ 'monitor': None, 'converged_reason': None, 'atol': 1e-0, - 'rtol': 1e-8, + 'rtol': 1e-10, 'stol': 1e-12, - 'max_its': 1 + 'ksp_ew': None, + 'ksp_ew_version': 1, }, 'mat_type': 'matfree', 'ksp_type': 'fgmres', 'ksp': { 'monitor': None, 'converged_reason': None, + 'rtol': 1e-5, + 'atol': 1e-0, }, 'pc_type': 'python', - 'pc_python_type': 'asQ.DiagFFTPC' + 'pc_python_type': 'asQ.DiagFFTPC', + 'diagfft_state': 'window', + 'aaos_jacobian_state': 'current' } sparameters_diag['diagfft_block_'] = sparameters @@ -119,6 +139,7 @@ velocity_expression=galewsky.velocity_expression, depth_expression=galewsky.depth_expression, reference_depth=galewsky.H0, + reference_state=True, create_mesh=create_mesh, dt=dt, theta=0.5, alpha=args.alpha, time_partition=time_partition, @@ -126,6 +147,12 @@ file_name='output/'+args.filename) +ics = miniapp.aaos.initial_condition +miniapp.aaos.reference_state.assign(ics) +miniapp.aaos.reference_state.subfunctions[0].assign(0) +miniapp.aaos.reference_state.subfunctions[1].assign(galewsky.H0) + + def window_preproc(swe_app, pdg, wndw): PETSc.Sys.Print('') PETSc.Sys.Print(f'### === --- Calculating time-window {wndw} --- === ###') @@ -151,7 +178,7 @@ def window_postproc(swe_app, pdg, wndw): PETSc.Sys.Print('### === --- Iteration counts --- === ###') from asQ import write_paradiag_metrics -write_paradiag_metrics(miniapp.paradiag) +write_paradiag_metrics(miniapp.paradiag, directory=args.metrics_dir) PETSc.Sys.Print('') diff --git a/examples/shallow_water/linear_gravity_bumps.py b/examples/shallow_water/linear_gravity_bumps.py index 42252c83..b33f8211 100644 --- a/examples/shallow_water/linear_gravity_bumps.py +++ b/examples/shallow_water/linear_gravity_bumps.py @@ -22,7 +22,8 @@ parser.add_argument('--slice_length', type=int, default=2, help='Number of timesteps per time-slice.') parser.add_argument('--alpha', type=float, default=0.0001, help='Circulant coefficient.') parser.add_argument('--dt', type=float, default=0.05, help='Timestep in hours.') -parser.add_argument('--filename', type=str, default='gravity_waves', help='Name of output vtk files') +parser.add_argument('--filename', type=str, default='gravity_waves', help='Name of output vtk files.') +parser.add_argument('--metrics_dir', type=str, default='metrics', help='Directory to save paradiag 
metrics to.') parser.add_argument('--show_args', action='store_true', help='Output all the arguments.') args = parser.parse_known_args() @@ -58,8 +59,13 @@ }, 'sub': { 'ksp_type': 'preonly', - 'pc_type': 'lu', - 'pc_factor_shift_type': 'nonzero' + 'pc_type': 'fieldsplit', + 'pc_fieldsplit_type': 'schur', + 'pc_fieldsplit_detect_saddle_point': None, + 'pc_fieldsplit_schur_fact_type': 'full', + 'pc_fieldsplit_schur_precondition': 'full', + 'fieldsplit_ksp_type': 'preonly', + 'fieldsplit_pc_type': 'lu', } } @@ -83,12 +89,12 @@ 'mat_type': 'matfree', 'ksp_type': 'fgmres', 'ksp': { - 'atol': 1e-8, - 'rtol': 1e-8, - 'max_it': 400 + 'atol': 1e-5, + 'rtol': 1e-5, + 'max_it': 60 }, 'pc_type': 'mg', - 'pc_mg_cycle_type': 'v', + 'pc_mg_cycle_type': 'w', 'pc_mg_type': 'multiplicative', 'mg': mg_parameters } @@ -111,7 +117,9 @@ 'converged_reason': None, }, 'pc_type': 'python', - 'pc_python_type': 'asQ.DiagFFTPC' + 'pc_python_type': 'asQ.DiagFFTPC', + 'diagfft_state': 'linear', + 'aaos_jacobian_state': 'linear', } sparameters_diag['diagfft_block'] = sparameters @@ -162,7 +170,7 @@ def window_postproc(swe_app, pdg, wndw): PETSc.Sys.Print('### === --- Iteration counts --- === ###') from asQ import write_paradiag_metrics -write_paradiag_metrics(miniapp.paradiag) +write_paradiag_metrics(miniapp.paradiag, directory=args.metrics_dir) PETSc.Sys.Print('') diff --git a/examples/shallow_water/williamson2.py b/examples/shallow_water/williamson2.py index 44108382..5ee8137f 100644 --- a/examples/shallow_water/williamson2.py +++ b/examples/shallow_water/williamson2.py @@ -10,6 +10,8 @@ import utils.shallow_water as swe import utils.shallow_water.williamson1992.case2 as case2 +Print = PETSc.Sys.Print + PETSc.Sys.popErrorHandler() # get command arguments @@ -23,11 +25,9 @@ parser.add_argument('--nwindows', type=int, default=1, help='Number of time-windows.') parser.add_argument('--nslices', type=int, default=2, help='Number of time-slices per time-window.') parser.add_argument('--slice_length', type=int, default=2, help='Number of timesteps per time-slice.') -parser.add_argument('--nspatial_domains', type=int, default=2, help='Size of spatial partition.') parser.add_argument('--alpha', type=float, default=0.0001, help='Circulant coefficient.') parser.add_argument('--dt', type=float, default=0.5, help='Timestep in hours.') parser.add_argument('--filename', type=str, default='w5diag', help='Name of output vtk files') -parser.add_argument('--coords_degree', type=int, default=1, help='Degree of polynomials for sphere mesh approximation.') parser.add_argument('--degree', type=int, default=1, help='Degree of finite element space (the DG space).') parser.add_argument('--show_args', action='store_true', help='Output all the arguments.') @@ -35,44 +35,38 @@ args = args[0] if args.show_args: - PETSc.Sys.Print(args) + Print(args) -PETSc.Sys.Print('') -PETSc.Sys.Print('### === --- Setting up --- === ###') -PETSc.Sys.Print('') +Print('') +Print('### === --- Setting up --- === ###') +Print('') # time steps -time_partition = [args.slice_length for _ in range(args.nslices)] +time_partition = tuple((args.slice_length for _ in range(args.nslices))) window_length = sum(time_partition) nsteps = args.nwindows*window_length dt = args.dt*units.hour - # multigrid mesh set up ensemble = asQ.create_ensemble(time_partition) -distribution_parameters = {"partition": True, "overlap_type": (fd.DistributedMeshOverlapType.VERTEX, 2)} - # mesh set up -mesh = mg.icosahedral_mesh(R0=earth.radius, - base_level=args.base_level, - degree=args.coords_degree, 
- distribution_parameters=distribution_parameters, - nrefs=args.ref_level-args.base_level, - comm=ensemble.comm) +mesh = swe.create_mg_globe_mesh(comm=ensemble.comm, + base_level=args.base_level, + ref_level=args.ref_level, + coords_degree=1) + x = fd.SpatialCoordinate(mesh) # Mixed function space for velocity and depth -V1 = swe.default_velocity_function_space(mesh, degree=args.degree) -V2 = swe.default_depth_function_space(mesh, degree=args.degree) -W = fd.MixedFunctionSpace((V1, V2)) +W = swe.default_function_space(mesh, degree=args.degree) +V1, V2 = W.subfunctions[:] # initial conditions w0 = fd.Function(W) -un = w0.subfunctions[0] -hn = w0.subfunctions[1] +un, hn = w0.subfunctions[:] f = case2.coriolis_expression(*x) b = case2.topography_function(*x, V2, name="Topography") @@ -85,64 +79,114 @@ # nonlinear swe forms -def form_function(u, h, v, q): - return swe.nonlinear.form_function(mesh, earth.Gravity, b, f, u, h, v, q) +def form_function(u, h, v, q, t): + return swe.nonlinear.form_function(mesh, earth.Gravity, b, f, u, h, v, q, t) def form_mass(u, h, v, q): return swe.nonlinear.form_mass(mesh, u, h, v, q) +def jacobian_function(u, h, v, q, t): + return swe.linear.form_function(mesh, earth.Gravity, H, f, u, h, v, q, t) + + +def jacobian_mass(u, h, v, q): + return swe.linear.form_mass(mesh, u, h, v, q) + + # parameters for the implicit diagonal solve in step-(b) +patch_parameters = { + 'pc_patch': { + 'save_operators': True, + 'partition_of_unity': True, + 'sub_mat_type': 'seqdense', + 'construct_dim': 0, + 'construct_type': 'vanka', + 'local_type': 'additive', + 'precompute_element_tensors': True, + 'symmetrise_sweep': False + }, + 'sub': { + 'ksp_type': 'preonly', + 'pc_type': 'fieldsplit', + 'pc_fieldsplit_type': 'schur', + 'pc_fieldsplit_detect_saddle_point': None, + 'pc_fieldsplit_schur_fact_type': 'full', + 'pc_fieldsplit_schur_precondition': 'full', + 'fieldsplit_ksp_type': 'preonly', + 'fieldsplit_pc_type': 'lu', + } +} + +mg_parameters = { + 'levels': { + 'ksp_type': 'gmres', + 'ksp_max_it': 5, + 'pc_type': 'python', + 'pc_python_type': 'firedrake.PatchPC', + 'patch': patch_parameters + }, + 'coarse': { + 'pc_type': 'python', + 'pc_python_type': 'firedrake.AssembledPC', + 'assembled_pc_type': 'lu', + 'assembled_pc_factor_mat_solver_type': 'mumps' + } +} + sparameters = { 'mat_type': 'matfree', 'ksp_type': 'fgmres', - 'ksp_atol': 1e-8, - 'ksp_rtol': 1e-8, - 'ksp_max_it': 400, + 'ksp': { + 'atol': 1e-5, + 'rtol': 1e-5, + 'max_it': 60 + }, 'pc_type': 'mg', - 'pc_mg_cycle_type': 'v', + 'pc_mg_cycle_type': 'w', 'pc_mg_type': 'multiplicative', - 'mg_levels_ksp_type': 'gmres', - 'mg_levels_ksp_max_it': 5, - 'mg_levels_pc_type': 'python', - 'mg_levels_pc_python_type': 'firedrake.PatchPC', - 'mg_levels_patch_pc_patch_save_operators': True, - 'mg_levels_patch_pc_patch_partition_of_unity': True, - 'mg_levels_patch_pc_patch_sub_mat_type': 'seqdense', - 'mg_levels_patch_pc_patch_construct_dim': 0, - 'mg_levels_patch_pc_patch_construct_type': 'vanka', - 'mg_levels_patch_pc_patch_local_type': 'additive', - 'mg_levels_patch_pc_patch_precompute_element_tensors': True, - 'mg_levels_patch_pc_patch_symmetrise_sweep': False, - 'mg_levels_patch_sub_ksp_type': 'preonly', - 'mg_levels_patch_sub_pc_type': 'lu', - 'mg_levels_patch_sub_pc_factor_shift_type': 'nonzero', - 'mg_coarse_pc_type': 'python', - 'mg_coarse_pc_python_type': 'firedrake.AssembledPC', - 'mg_coarse_assembled_pc_type': 'lu', - 'mg_coarse_assembled_pc_factor_mat_solver_type': 'mumps', + 'mg': mg_parameters } +atol = 1e0 
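+# the same absolute tolerance is reused for the snes and ksp solves below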
sparameters_diag = { - 'snes_linesearch_type': 'basic', - 'snes_monitor': None, - 'snes_converged_reason': None, - 'snes_atol': 1e-0, - 'snes_rtol': 1e-12, - 'snes_stol': 1e-12, - 'snes_max_it': 100, + 'snes': { + 'linesearch_type': 'basic', + 'monitor': None, + 'converged_reason': None, + 'atol': atol, + 'rtol': 1e-10, + 'stol': 1e-12, + 'ksp_ew': None, + 'ksp_ew_version': 1, + }, 'mat_type': 'matfree', 'ksp_type': 'fgmres', - 'ksp_monitor': None, - 'ksp_converged_reason': None, + 'ksp': { + 'monitor': None, + 'converged_reason': None, + 'rtol': 1e-5, + 'atol': atol, + }, 'pc_type': 'python', - 'pc_python_type': 'asQ.DiagFFTPC'} + 'pc_python_type': 'asQ.DiagFFTPC', + 'diagfft_state': 'reference', + 'diagfft_linearisation': 'user', + 'aaos_jacobian_state': 'reference', + 'aaos_jacobian_linearisation': 'user', +} + +# reference conditions +wref = w0.copy(deepcopy=True) +uref, href = wref.subfunctions[:] +# uref.assign(0) +# href.assign(H) -PETSc.Sys.Print('### === --- Calculating parallel solution --- === ###') -PETSc.Sys.Print('') +Print('### === --- Calculating parallel solution --- === ###') +# Print('') -sparameters_diag['diagfft_block_'] = sparameters +sparameters_diag['diagfft_block'] = sparameters # non-petsc information for block solve block_ctx = {} @@ -154,22 +198,25 @@ def form_mass(u, h, v, q): tm = mg.manifold_transfer_manager(W) transfer_managers.append(tm) -block_ctx['diag_transfer_managers'] = transfer_managers +block_ctx['diagfft_transfer_managers'] = transfer_managers PD = asQ.paradiag(ensemble=ensemble, form_function=form_function, - form_mass=form_mass, w0=w0, + form_mass=form_mass, + jacobian_function=jacobian_function, + jacobian_mass=jacobian_mass, + w0=w0, reference_state=wref, dt=dt, theta=0.5, alpha=args.alpha, - time_partition=time_partition, solver_parameters=sparameters_diag, - circ=None, tol=1.0e-6, maxits=None, - ctx={}, block_ctx=block_ctx, block_mat_type="aij") + time_partition=time_partition, + solver_parameters=sparameters_diag, + circ=None, block_ctx=block_ctx) def window_preproc(pdg, wndw): - PETSc.Sys.Print('') - PETSc.Sys.Print(f'### === --- Calculating time-window {wndw} --- === ###') - PETSc.Sys.Print('') + Print('') + Print(f'### === --- Calculating time-window {wndw} --- === ###') + Print('') # check against initial conditions @@ -209,8 +256,8 @@ def for_each_callback(window_index, slice_index, w): timestep = wndw*window_length + window_index uerr = errors[window_index, 0] herr = errors[window_index, 1] - PETSc.Sys.Print(f"timestep={timestep}, uerr={uerr}, herr={herr}", - comm=ensemble.comm) + Print(f"timestep={timestep}, uerr={uerr}, herr={herr}", + comm=ensemble.comm) PD.solve(nwindows=args.nwindows, diff --git a/examples/shallow_water/williamson5.py b/examples/shallow_water/williamson5.py index df23b734..4256f340 100644 --- a/examples/shallow_water/williamson5.py +++ b/examples/shallow_water/williamson5.py @@ -44,7 +44,6 @@ nsteps = args.nwindows*window_length dt = args.dt*units.hour - # parameters for the implicit diagonal solve in step-(b) sparameters = { 'mat_type': 'matfree', diff --git a/examples/stratigraphic_model/mms.py b/examples/stratigraphic_model/mms.py new file mode 100644 index 00000000..e0ac5526 --- /dev/null +++ b/examples/stratigraphic_model/mms.py @@ -0,0 +1,86 @@ +import firedrake as fd +from firedrake.petsc import PETSc +from math import pi +# A toy equation for Stratigraphic modelling. 
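+# In strong form the sediment thickness s satisfies
+#     s_t = div(D(d)*grad(s)) + G(d) + Rhs,
+# where d = l - b - s is the water depth (sea level l minus seabed b minus
+# sediment s), D is a depth-dependent diffusivity, G a depth-dependent
+# carbonate growth rate, and Rhs a manufactured source term so the
+# backward-Euler discretisation can be compared against a known solution.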
+ + +class Problem(object): + def __init__(self, N, degree): + super().__init__() + self.N = N + self.degree = degree + + def mesh(self): + mesh = fd.RectangleMesh(self.N, self.N, 100, 100, quadrilateral=False) + return mesh + + def function_space(self, mesh): + Ve = fd.FiniteElement("CG", mesh.ufl_cell(), self.degree) + return fd.FunctionSpace(mesh, Ve) + + +if __name__ == "__main__": + + for m in [1, 2, 4]: + N = m*500 + problem = Problem(N, 1) + mesh = problem.mesh() + n = fd.FacetNormal(mesh) + Z = problem.function_space(mesh) + x, y = fd.SpatialCoordinate(mesh) + PETSc.Sys.Print("Z.dim():%s" % Z.dim()) + sp = { + "snes_max_it": 2000, + "snes_atol": 1.0e-8, + "snes_rtol": 1.0e-8, + "snes_monitor": None, + "snes_linesearch_type": "l2", + "snes_converged_reason": None, + "mat_type": "aij", + "ksp_type": "preonly", + "pc_type": "lu", + "pc_factor_mat_solver_type": "mumps" + } + + dt = fd.Constant(.001/m) + A = fd.Constant(50) + t = fd.Constant(0.0) + s0 = fd.Function(Z) + s0.interpolate(fd.sin((x+2*y))) + s_exact = fd.Function(Z) +# print(fd.norm(s_exact)) + s = fd.Function(Z) + q = fd.TestFunction(Z) +# D_c is a constant diffusion parameter. + D_c = fd.Constant(2e-3) +# G_0 is a growth constant for the carbonate. + G_0 = fd.Constant(4e-3) +# b is the seabed. + b = 100*fd.tanh((x-50)/20) +# l is the sea level. + l = A*fd.sin(2*pi*t/500000) +# d is the water depth. + d = l-b-s + d_exact = l-b-s_exact + + D = 2*D_c/fd.sqrt(2*pi)*fd.exp(-1/2*((d-5)/10)**2) + D_exact = 2*D_c/fd.sqrt(2*pi)*fd.exp(-1/2*((d_exact-5)/10)**2) + + G = G_0*fd.conditional(d > 0, fd.exp(-d/10)/(1 + fd.exp(-50*d)), fd.exp((50-1/10)*d)/(fd.exp(50*d) + 1)) + G_exact = G_0*fd.conditional(d_exact > 0, fd.exp(-d_exact/10)/(1 + fd.exp(-50*d_exact)), fd.exp((50-1/10)*d_exact)/(fd.exp(50*d_exact) + 1)) + + Rhs = fd.div(D_exact*fd.grad(s_exact)) + G_exact + + F = D*fd.inner(fd.grad(s), fd.grad(q))*fd.dx - G*q*fd.dx - Rhs*q*fd.dx - D_exact*fd.inner(fd.inner(fd.grad(s_exact), n), q)*fd.ds + + F_euler = (fd.inner(s, q)*fd.dx - fd.inner(s0, q)*fd.dx + dt*(F)) + nvproblem = fd.NonlinearVariationalProblem(F_euler, s) + solver = fd.NonlinearVariationalSolver(nvproblem, solver_parameters=sp) + + for i in range(1): + t.assign(t+dt) + s_exact = fd.interpolate((t+1)*fd.sin((x+2*y)), Z) + solver.solve() + A = fd.errornorm(s, s_exact)/fd.norm(s_exact) + PETSc.Sys.Print("E :%s" % A) + s0.assign(s) diff --git a/examples/stratigraphic_model/serial.py b/examples/stratigraphic_model/serial.py new file mode 100644 index 00000000..7da5ce2e --- /dev/null +++ b/examples/stratigraphic_model/serial.py @@ -0,0 +1,77 @@ +import firedrake as fd +from firedrake.petsc import PETSc +from math import pi + +# A toy equation for Stratigraphic modelling. + + +class Problem(object): + def __init__(self, N, degree): + super().__init__() + self.N = N + self.degree = degree + + def mesh(self): + mesh = fd.RectangleMesh(self.N, self.N, 100, 100, quadrilateral=False) + return mesh + +# Initial condition on thickness of the carbonate layer. 
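+# (zero thickness everywhere; sediment then accumulates according to the
+# depth-dependent diffusion and growth terms defined in the main script below)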
+ def initial_condition(self, Z): + (x, y) = fd.SpatialCoordinate(Z.mesh()) + s = fd.Function(Z) + s.interpolate(fd.Constant(0)) + return s + + def function_space(self, mesh): + Ve = fd.FiniteElement("CG", mesh.ufl_cell(), self.degree) + return fd.FunctionSpace(mesh, Ve) + + +if __name__ == "__main__": + N = 1000 + problem = Problem(N, 1) + mesh = problem.mesh() + Z = problem.function_space(mesh) + s0 = problem.initial_condition(Z) + s = fd.Function(Z) + q = fd.TestFunction(Z) + x, y = fd.SpatialCoordinate(mesh) + PETSc.Sys.Print("Z.dim():%s" % Z.dim()) + sp = { + "snes_max_it": 2000, + "snes_atol": 1.0e-8, + "snes_rtol": 1.0e-8, + "snes_monitor": None, + "snes_linesearch_type": "l2", + "snes_converged_reason": None, + "mat_type": "aij", + "ksp_type": "preonly", + "pc_type": "lu", + "pc_factor_mat_solver_type": "mumps" + } + + dt = fd.Constant(1000) + A = fd.Constant(50) + t = fd.Constant(0) +# D_c is a constant diffusion parameter. + D_c = fd.Constant(2e-3) +# G_0 is a growth constant for the carbonate. + G_0 = fd.Constant(4e-3) +# b is the seabed. + b = 100*fd.tanh((x-50)/20) +# l is the sea level. + l = A*fd.sin(2*pi*t/500000) +# d is the water depth. + d = l-b-s + D = 2*D_c/fd.sqrt(2*pi)*fd.exp(-1/2*((d-5)/10)**2) + G = fd.conditional(d > 0, G_0*fd.exp(-d/10), G_0*fd.exp((d/.1)**3)) + F = D*fd.inner(fd.grad(s), fd.grad(q))*fd.dx - G*q*fd.dx + F_euler = (fd.inner(s, q)*fd.dx - fd.inner(s0, q)*fd.dx + dt*(F)) + nvproblem = fd.NonlinearVariationalProblem(F_euler, s) + solver = fd.NonlinearVariationalSolver(nvproblem, solver_parameters=sp) + outfile = fd.File("s.pvd") + while (float(t) < 128*float(dt)): + t.assign(float(t+dt)) + solver.solve() + s0.assign(s) + outfile.write(s) diff --git a/examples/stratigraphic_model/stratigraphic.py b/examples/stratigraphic_model/stratigraphic.py new file mode 100644 index 00000000..1b501916 --- /dev/null +++ b/examples/stratigraphic_model/stratigraphic.py @@ -0,0 +1,221 @@ +import matplotlib.pyplot as plt +from math import pi +from matplotlib.animation import FuncAnimation +import firedrake as fd +from firedrake.petsc import PETSc +import asQ + +import argparse + +parser = argparse.ArgumentParser( + description='Paradiag for Stratigraphic model that simulate formation of sedimentary rock over geological time.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--nx', type=int, default=200, help='Number of cells along each square side.') +parser.add_argument('--degree', type=int, default=1, help='Degree of the scalar and velocity spaces.') +parser.add_argument('--theta', type=float, default=0.5, help='Parameter for the implicit theta timestepping method.') +parser.add_argument('--nwindows', type=int, default=1, help='Number of time-windows.') +parser.add_argument('--nslices', type=int, default=2, help='Number of time-slices per time-window.') +parser.add_argument('--slice_length', type=int, default=2, help='Number of timesteps per time-slice.') +parser.add_argument('--alpha', type=float, default=0.0001, help='Circulant coefficient.') +parser.add_argument('--nsample', type=int, default=16, help='Number of sample points for plotting.') +parser.add_argument('--show_args', action='store_true', help='Output all the arguments.') + +args = parser.parse_known_args() +args = args[0] + +if args.show_args: + PETSc.Sys.Print(args) +# The time partition describes how many timesteps are included on each time-slice of the ensemble +# Here we use the same number of timesteps on each slice, but they can be different + +time_partition = 
tuple(args.slice_length for _ in range(args.nslices)) +window_length = sum(time_partition) +nsteps = args.nwindows*window_length + +dt = 1000 + +# The Ensemble with the spatial and time communicators +ensemble = asQ.create_ensemble(time_partition) + +# # # === --- domain --- === # # # + +# The mesh needs to be created with the spatial communicator +mesh = fd.SquareMesh(args.nx, args.nx, 100, quadrilateral=False, comm=ensemble.comm) + +V = fd.FunctionSpace(mesh, "CG", args.degree) + +# # # === --- initial conditions --- === # # # + +x, y = fd.SpatialCoordinate(mesh) + +s0 = fd.Function(V, name="scalar_initial") +s0.interpolate(fd.Constant(0.0)) + + +# The sediment movement D +def D(D_c, d): + return D_c*2/fd.Constant(fd.sqrt(2*pi))*fd.exp(-1/2*((d-5)/10)**2) + + +# The carbonate growth L. +def L(G_0, d): + return G_0*fd.conditional(d > 0, fd.exp(-d/10)/(1 + fd.exp(-50*d)), fd.exp((50-1/10)*d)/(fd.exp(50*d) + 1)) + + +# # # === --- finite element forms --- === # # # + + +def form_mass(s, q): + return s*q*fd.dx + + +D_c = fd.Constant(.002) +G_0 = fd.Constant(.004) +A = fd.Constant(50) +b = 100*fd.tanh(1/20*(x-50)) + + +def form_function(s, q, t): + return D(D_c, A*fd.sin(2*pi*t/500000)-b-s)*fd.inner(fd.grad(s), fd.grad(q))*fd.dx-L(G_0, A*fd.sin(2*pi*t/500000)-b-s)*q*fd.dx + + +# # # === --- PETSc solver parameters --- === # # # + + +# The PETSc solver parameters used to solve the +# blocks in step (b) of inverting the ParaDiag matrix. +block_parameters = { + "ksp_type": "preonly", + "pc_type": "lu", +} + +# The PETSc solver parameters for solving the all-at-once system. +# The python preconditioner 'asQ.DiagFFTPC' applies the ParaDiag matrix. +# +# The equation is linear so we can either: +# a) Solve it in one shot using a preconditioned Krylov method: +# P^{-1}Au = P^{-1}b +# The solver options for this are: +# 'ksp_type': 'fgmres' +# We need fgmres here because gmres is used on the blocks. +# b) Solve it with Picard iterations: +# Pu_{k+1} = (P - A)u_{k} + b +# The solver options for this are: +# 'ksp_type': 'preonly' + +paradiag_parameters = { + 'snes': { + 'linesearch_type': 'basic', + 'monitor': None, + 'converged_reason': None, + 'rtol': 1e-10, + 'atol': 1e-12, + 'stol': 1e-12, + }, + 'mat_type': 'matfree', + 'ksp_type': 'fgmres', + 'ksp': { + 'monitor': None, + 'converged_reason': None, + 'rtol': 1e-10, + 'atol': 1e-12, + 'stol': 1e-12, + }, + 'pc_type': 'python', + 'pc_python_type': 'asQ.DiagFFTPC' +} +# We need to add a block solver parameters dictionary for each block. +# Here they are all the same but they could be different. +for i in range(window_length): + paradiag_parameters['diagfft_block_'+str(i)+'_'] = block_parameters + + +# # # === --- Setup ParaDiag --- === # # # + + +# Give everything to asQ to create the paradiag object. +# the circ parameter determines where the alpha-circulant +# approximation is introduced. None means only in the preconditioner. +pdg = asQ.paradiag(ensemble=ensemble, + form_function=form_function, + form_mass=form_mass, + w0=s0, dt=dt, theta=args.theta, + alpha=args.alpha, time_partition=time_partition, + solver_parameters=paradiag_parameters, + circ=None) + + +# This is a callback which will be called before pdg solves each time-window +# We can use this to make the output a bit easier to read +def window_preproc(pdg, wndw): + PETSc.Sys.Print('') + PETSc.Sys.Print(f'### === --- Calculating time-window {wndw} --- === ###') + PETSc.Sys.Print('') + + +# The last time-slice will be saving snapshots to create an animation. 
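+# (Collecting snapshots on a single time-slice avoids duplicating the output on every time-rank.)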
+# The layout member describes the time_partition. +# layout.is_local(i) returns True/False if the timestep index i is on the +# current time-slice. Here we use -1 to mean the last timestep in the window. +is_last_slice = pdg.layout.is_local(-1) + +# Make an output Function on the last time-slice and start a snapshot list +if is_last_slice: + qout = fd.Function(V) + timeseries = [s0.copy(deepcopy=True)] + + +# This is a callback which will be called after pdg solves each time-window +# We can use this to save the last timestep of each window for plotting. +def window_postproc(pdg, wndw): + if is_last_slice: + # The aaos is the AllAtOnceSystem which represents the time-dependent problem. + # get_field extracts one timestep of the window. -1 is again used to get the last + # timestep and place it in qout. + pdg.aaos.get_field(-1, index_range='window', wout=qout) + timeseries.append(qout.copy(deepcopy=True)) + + +# Solve nwindows of the all-at-once system +pdg.solve(args.nwindows, + preproc=window_preproc, + postproc=window_postproc) + + +# # # === --- Postprocessing --- === # # # + +# paradiag collects a few solver diagnostics for us to inspect +nw = args.nwindows + +# Number of nonlinear iterations, total and per window. +# (1 for fgmres and # picard iterations for preonly) +PETSc.Sys.Print(f'nonlinear iterations: {pdg.nonlinear_iterations} | iterations per window: {pdg.nonlinear_iterations/nw}') + +# Number of linear iterations, total and per window. +# (# of gmres iterations for fgmres and # picard iterations for preonly) +PETSc.Sys.Print(f'linear iterations: {pdg.linear_iterations} | iterations per window: {pdg.linear_iterations/nw}') + +# Number of iterations needed for each block in step-(b), total and per block solve +# The number of iterations for each block will usually be different because of the different eigenvalues +PETSc.Sys.Print(f'block linear iterations: {pdg.block_iterations._data} | iterations per block solve: {pdg.block_iterations._data/pdg.linear_iterations}') + +# We can write these diagnostics to file, along with some other useful information. +# Files written are: aaos_metrics.txt, block_metrics.txt, paradiag_setup.txt, solver_parameters.txt +asQ.write_paradiag_metrics(pdg) + +# Make an animation from the snapshots we collected and save it to periodic.mp4. +if is_last_slice: + + fn_plotter = fd.FunctionPlotter(mesh, num_sample_points=args.nsample) + + fig, axes = plt.subplots() + axes.set_aspect('equal') + colors = fd.tripcolor(qout, num_sample_points=args.nsample, axes=axes) + fig.colorbar(colors) + + def animate(q): + colors.set_array(fn_plotter(q)) + + animation = FuncAnimation(fig, animate, frames=timeseries) + + animation.save("periodic.mp4", writer="ffmpeg") diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index e08ba854..00000000 --- a/tests/conftest.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Global test configuration.""" - -import pytest - -from subprocess import check_call - - -def parallel(item): - """Run a test in parallel. - - :arg item: The test item to run. - """ - from mpi4py import MPI - if MPI.COMM_WORLD.size > 1: - raise RuntimeError("parallel test can't be run within parallel environment") - marker = item.get_closest_marker("parallel") - if marker is None: - raise RuntimeError("Parallel test doesn't have parallel marker") - nprocs = marker.kwargs.get("nprocs", 3) - if nprocs < 2: - raise RuntimeError("Need at least two processes to run parallel test") - # Only spew tracebacks on rank 0. 
- # Run xfailing tests to ensure that errors are reported to calling process - - call = ["mpiexec", "-n", "1", "python", "-m", "pytest", "--runxfail", "-s", "-q", "%s::%s" % (item.fspath, item.name)] - call.extend([":", "-n", "%d" % (nprocs - 1), "python", "-m", "pytest", "--runxfail", "--tb=no", "-q", - "%s::%s" % (item.fspath, item.name)]) - check_call(call) - - -def pytest_runtest_call(item): - from mpi4py import MPI - if item.get_closest_marker("parallel") and MPI.COMM_WORLD.size == 1: - # Spawn parallel processes to run test - parallel(item) - - -def pytest_configure(config): - """Register an additional marker.""" - config.addinivalue_line( - "markers", - "parallel(nprocs): mark test to run in parallel on nprocs processors") - - -@pytest.fixture(autouse=True) -def old_pytest_runtest_setup(request): - item = request.node - if item.get_closest_marker("parallel"): - from mpi4py import MPI - if MPI.COMM_WORLD.size > 1: - # Turn on source hash checking - from firedrake import parameters - from functools import partial - - def _reset(check): - parameters["pyop2_options"]["check_src_hashes"] = check - - # Reset to current value when test is cleaned up - item.addfinalizer(partial(_reset, - parameters["pyop2_options"]["check_src_hashes"])) - - parameters["pyop2_options"]["check_src_hashes"] = True - else: - # Blow away function arg in "master" process, to ensure - # this test isn't run on only one process. - item.obj = lambda *args, **kwargs: None diff --git a/tests/test_paradiag.py b/tests/test_paradiag.py index bdf11c08..0c95ff1b 100644 --- a/tests/test_paradiag.py +++ b/tests/test_paradiag.py @@ -56,12 +56,12 @@ def test_galewsky_timeseries(): h_initial.project(galewsky.depth_expression(*x)) # shallow water equation forms - def form_function(u, h, v, q): + def form_function(u, h, v, q, t): return swe.nonlinear.form_function(mesh, gravity, topography, coriolis, - u, h, v, q) + u, h, v, q, t) def form_mass(u, h, v, q): return swe.nonlinear.form_mass(mesh, u, h, v, q) @@ -123,8 +123,10 @@ def form_mass(u, h, v, q): 'monitor': None, 'converged_reason': None, 'atol': 1e-0, - 'rtol': 1e-12, + 'rtol': 1e-10, 'stol': 1e-12, + 'ksp_ew': None, + 'ksp_ew_version': 1, } # solver parameters for serial method @@ -139,7 +141,7 @@ def form_mass(u, h, v, q): parallel_sparameters = { 'snes': snes_sparameters, 'mat_type': 'matfree', - 'ksp_type': 'preonly', + 'ksp_type': 'fgmres', 'ksp': { 'monitor': None, 'converged_reason': None, @@ -202,6 +204,122 @@ def parallel_postproc(pdg, wndw): assert err/norm0 < 1e-5 +@pytest.mark.parallel(nprocs=4) +def test_Nitsche_heat_timeseries(): + from utils.serial import ComparisonMiniapp + from copy import deepcopy + + nwindows = 1 + nslices = 2 + slice_length = 2 + alpha = 0.0001 + dt = 0.5 + theta = 0.5 + + time_partition = [slice_length for _ in range(nslices)] + ensemble = asQ.create_ensemble(time_partition) + nx = 10 + mesh = fd.UnitSquareMesh(nx, nx, comm=ensemble.comm) + x, y = fd.SpatialCoordinate(mesh) + n = fd.FacetNormal(mesh) + + W = fd.FunctionSpace(mesh, 'CG', 1) + + # initial conditions + w_initial = fd.Function(W) + w_initial.interpolate(fd.exp(0.5*x + y)) + + # Heat equaion with Nitsch BCs. 
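+    # Weak form of u_t - Delta(u) = 0 with Dirichlet data u = exp(0.5*x + y + 1.25*t)
+    # imposed weakly. The three surface terms are the Nitsche consistency, symmetry and
+    # penalty terms; the penalty coefficient 20*nx scales like 1/h.
+    # The boundary data is also the exact solution, since u_t = 1.25*u = (0.25 + 1)*u = Delta(u).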
+ def form_function(u, v, t): + return fd.inner(fd.grad(u), fd.grad(v))*fd.dx - fd.inner(v, fd.inner(fd.grad(u), n))*fd.ds - fd.inner(u-fd.exp(0.5*x + y + 1.25*t), fd.inner(fd.grad(v), n))*fd.ds + 20*nx*fd.inner(u-fd.exp(0.5*x + y + 1.25*t), v)*fd.ds + + def form_mass(u, v): + return u*v*fd.dx + + block_sparameters = { + 'ksp_type': 'preonly', + 'ksp': { + 'atol': 1e-5, + 'rtol': 1e-5, + }, + 'pc_type': 'lu', + } + + snes_sparameters = { + 'monitor': None, + 'converged_reason': None, + 'atol': 1e-10, + 'rtol': 1e-12, + 'stol': 1e-12, + } + + # solver parameters for serial method + serial_sparameters = { + 'snes': snes_sparameters + } + serial_sparameters.update(deepcopy(block_sparameters)) + serial_sparameters['ksp']['monitor'] = None + serial_sparameters['ksp']['converged_reason'] = None + + # solver parameters for parallel method + parallel_sparameters = { + 'snes': snes_sparameters, + 'mat_type': 'matfree', + 'ksp_type': 'preonly', + 'ksp': { + 'monitor': None, + 'converged_reason': None, + }, + 'pc_type': 'python', + 'pc_python_type': 'asQ.DiagFFTPC', + } + + for i in range(sum(time_partition)): + parallel_sparameters['diagfft_block_'+str(i)] = block_sparameters + + block_ctx = {} + + miniapp = ComparisonMiniapp(ensemble, time_partition, + form_mass, + form_function, + w_initial, + dt, theta, alpha, + serial_sparameters, + parallel_sparameters, + circ=None, block_ctx=block_ctx) + + norm0 = fd.norm(w_initial) + + def preproc(serial_app, paradiag, wndw): + PETSc.Sys.Print('') + PETSc.Sys.Print(f'### === --- Time window {wndw} --- === ###') + PETSc.Sys.Print('') + PETSc.Sys.Print('=== --- Parallel solve --- ===') + PETSc.Sys.Print('') + + def parallel_postproc(pdg, wndw): + PETSc.Sys.Print('') + PETSc.Sys.Print('=== --- Serial solve --- ===') + PETSc.Sys.Print('') + return + + PETSc.Sys.Print('') + PETSc.Sys.Print('### === --- Timestepping loop --- === ###') + + errors = miniapp.solve(nwindows=nwindows, + preproc=preproc, + parallel_postproc=parallel_postproc) + + PETSc.Sys.Print('') + PETSc.Sys.Print('### === --- Errors --- === ###') + + for it, err in enumerate(errors): + PETSc.Sys.Print(f'Timestep {it} error: {err/norm0}') + + for err in errors: + assert err/norm0 < 1e-5 + + @pytest.mark.parallel(nprocs=4) def test_steady_swe(): # test that steady-state is maintained for shallow water eqs @@ -243,8 +361,8 @@ def test_steady_swe(): # finite element forms - def form_function(u, h, v, q): - return swe.form_function(mesh, g, b, f, u, h, v, q) + def form_function(u, h, v, q, t): + return swe.form_function(mesh, g, b, f, u, h, v, q, t) def form_mass(u, h, v, q): return swe.form_mass(mesh, u, h, v, q) @@ -267,6 +385,8 @@ def form_mass(u, h, v, q): 'ksp_type': 'gmres', 'pc_type': 'python', 'pc_python_type': 'asQ.DiagFFTPC', + 'diagfft_state': 'initial', + 'aaos_jacobian_state': 'initial', } M = [2, 2] @@ -311,6 +431,80 @@ def form_mass(u, h, v, q): assert (abs(uerr) < utol) +@pytest.mark.parallel(nprocs=4) +def test_Nitsche_BCs(): + # test the linear equation u_t - Delta u = 0, with u_ex = exp(0.5*x + y + 1.25*t) and weakly imposing Dirichlet BCs + nspatial_domains = 2 + degree = 1 + nx = 10 + dx = 1/nx + dt = dx + ensemble = fd.Ensemble(fd.COMM_WORLD, nspatial_domains) + mesh = fd.UnitSquareMesh(nx, nx, quadrilateral=False, comm=ensemble.comm) + + x, y = fd.SpatialCoordinate(mesh) + n = fd.FacetNormal(mesh) + V = fd.FunctionSpace(mesh, "CG", degree) + + w0 = fd.Function(V) + w0.interpolate(fd.exp(0.5*x + y)) + + def form_mass(q, phi): + return phi*q*fd.dx + + def form_function(q, phi, t): + 
return fd.inner(fd.grad(q), fd.grad(phi))*fd.dx - fd.inner(phi, fd.inner(fd.grad(q), n))*fd.ds - fd.inner(q-fd.exp(0.5*x + y + 1.25*t), fd.inner(fd.grad(phi), n))*fd.ds + 20*nx*fd.inner(q-fd.exp(0.5*x + y + 1.25*t), phi)*fd.ds + + # Parameters for the diag + sparameters = { + 'ksp_type': 'preonly', + 'pc_type': 'lu', + 'pc_factor_mat_solver_type': 'mumps'} + + solver_parameters_diag = { + "snes_linesearch_type": "basic", + 'snes_atol': 1e-8, + # 'snes_monitor': None, + # 'snes_converged_reason': None, + 'ksp_rtol': 1e-8, + # 'ksp_monitor': None, + # 'ksp_converged_reason': None, + 'mat_type': 'matfree', + 'ksp_type': 'gmres', + 'pc_type': 'python', + 'pc_python_type': 'asQ.DiagFFTPC', + } + + M = [2, 2] + solver_parameters_diag["diagfft_block_"] = sparameters + + alpha = 1.0e-3 + theta = 0.5 + + PD = asQ.paradiag(ensemble=ensemble, + form_function=form_function, + form_mass=form_mass, w0=w0, + dt=dt, theta=theta, + alpha=alpha, + time_partition=M, bcs=[], solver_parameters=solver_parameters_diag, + circ=None, tol=1.0e-6, maxits=None, + ctx={}, block_mat_type="aij") + PD.solve() + + # check against initial conditions + time = tuple(fd.Constant(0) for _ in range(2)) + q_exact = tuple(fd.Function(V) for _ in range(2)) + for n in range(2): + time[n].assign(time[n] + dt*(PD.aaos.transform_index(n, from_range='slice', to_range='window') + 1)) + q_exact[n].interpolate(fd.exp(.5*x + y + 1.25*time[n])) + + walls = PD.aaos.w_all.split() + for step in range(2): + qp = walls[step] + qerr = fd.errornorm(qp, q_exact[step]) + assert (qerr < (dx)**(3/2)) + + @pytest.mark.parallel(nprocs=6) def test_jacobian_heat_equation(): # tests the basic snes setup @@ -341,7 +535,7 @@ def test_jacobian_heat_equation(): # 'snes_view': None } - def form_function(u, v): + def form_function(u, v, t): return fd.inner(fd.grad(u), fd.grad(v)) * fd.dx def form_mass(u, v): @@ -379,11 +573,16 @@ def test_set_para_form(): theta = 0.5 alpha = 0.001 M = [2, 2, 2] + Ml = np.sum(M) + time = tuple(fd.Constant(0) for _ in range(Ml)) + for i in range(Ml): + time[i].assign(fd.Constant(i*dt)) + solver_parameters = {'ksp_type': 'gmres', 'pc_type': 'none', 'ksp_rtol': 1.0e-8, 'ksp_atol': 1.0e-8, 'ksp_monitor': None} - def form_function(u, v): + def form_function(u, v, t): return fd.inner(fd.grad(u), fd.grad(v))*fd.dx def form_mass(u, v): @@ -400,11 +599,11 @@ def form_mass(u, v): ctx={}, block_mat_type="aij") # sequential assembly - WFull = V * V * V * V * V * V * V * V + WFull = reduce(mul, (V for _ in range(Ml))) ufull = fd.Function(WFull) np.random.seed(132574) ufull_list = ufull.subfunctions - for i in range(8): + for i in range(6): ufull_list[i].dat.data[:] = np.random.randn(*(ufull_list[i].dat.data.shape)) rT = ensemble.ensemble_comm.rank @@ -420,14 +619,14 @@ def form_mass(u, v): vfull = fd.TestFunction(WFull) ufulls = fd.split(ufull) vfulls = fd.split(vfull) - for i in range(8): + for i in range(Ml): if i == 0: un = u0 else: un = ufulls[i-1] unp1 = ufulls[i] v = vfulls[i] - tform = form_mass(unp1 - un, v/dt) + form_function((unp1+un)/2, v) + tform = form_mass(unp1 - un, v/dt) + form_function(unp1/2, v, time[i]) + form_function(un/2, v, time[i]-dt) if i == 0: fullform = tform else: @@ -469,11 +668,16 @@ def test_set_para_form_mixed_parallel(): theta = 0.5 alpha = 0.001 M = [2, 2, 2] + Ml = np.sum(M) + time = tuple(fd.Constant(0) for _ in range(Ml)) + for i in range(Ml): + time[i].assign(fd.Constant(i*dt)) + solver_parameters = {'ksp_type': 'gmres', 'pc_type': 'none', 'ksp_rtol': 1.0e-8, 'ksp_atol': 1.0e-8, 'ksp_monitor': None} - def 
form_function(uu, up, vu, vp): + def form_function(uu, up, vu, vp, t): return (fd.div(vu)*up - fd.div(uu)*vp)*fd.dx def form_mass(uu, up, vu, vp): @@ -490,11 +694,11 @@ def form_mass(uu, up, vu, vp): ctx={}, block_mat_type="aij") # sequential assembly - WFull = W * W * W * W * W * W * W * W + WFull = reduce(mul, (W for _ in range(Ml))) ufull = fd.Function(WFull) np.random.seed(132574) ufull_list = ufull.subfunctions - for i in range((2*8)): + for i in range((2*6)): ufull_list[i].dat.data[:] = np.random.randn(*(ufull_list[i].dat.data.shape)) rT = ensemble.ensemble_comm.rank @@ -514,7 +718,7 @@ def form_mass(uu, up, vu, vp): ufulls = fd.split(ufull) vfulls = fd.split(vfull) - for i in range(8): + for i in range(Ml): if i == 0: un = u0 pn = p0 @@ -527,7 +731,8 @@ def form_mass(uu, up, vu, vp): vp = vfulls[2 * i + 1] # forms have 2 components and 2 test functions: (u, h, w, phi) tform = form_mass(unp1 - un, pnp1 - pn, vu / dt, vp / dt) \ - + form_function((unp1 + un) / 2, (pnp1 + pn) / 2, vu, vp) + + form_function(unp1 / 2, pnp1 / 2, vu, vp, time[i])\ + + form_function(un / 2, pn / 2, vu, vp, time[i] - dt) if i == 0: fullform = tform else: @@ -561,11 +766,15 @@ def test_jacobian_mixed_parallel(): u0 = w0.subfunctions[0] p0 = w0.subfunctions[1] p0.interpolate(fd.exp(-((x - 0.5) ** 2 + (y - 0.5) ** 2) / 0.5 ** 2)) + M = [2, 2, 2] + Ml = np.sum(M) dt = 0.01 + time = tuple(fd.Constant(0) for _ in range(Ml)) + for i in range(Ml): + time[i].assign(fd.Constant(i*dt)) + theta = 0.5 alpha = 0.001 - M = [2, 2, 2] - Ml = np.sum(M) c = fd.Constant(0.1) eps = fd.Constant(0.001) @@ -573,7 +782,7 @@ def test_jacobian_mixed_parallel(): 'ksp_rtol': 1.0e-8, 'ksp_atol': 1.0e-8, 'ksp_monitor': None} - def form_function(uu, up, vu, vp): + def form_function(uu, up, vu, vp, t): return (fd.div(vu) * up + c * fd.sqrt(fd.inner(uu, uu) + eps) * fd.inner(uu, vu) - fd.div(uu) * vp) * fd.dx @@ -630,7 +839,8 @@ def form_mass(uu, up, vu, vp): PD.aaos.update(PD.X) # use PD to calculate the Jacobian - Jac1 = PD.aaos.jacobian + Jac1 = PD.jacobian + Jac1.update() # updates Jacobian state from aaos # construct Petsc vector X1, Y1: nlocal = M[rT]*W.node_set.size # local times x local space @@ -664,8 +874,8 @@ def form_mass(uu, up, vu, vp): vu, vp = tfulls[2*i: 2*i + 2] tform = form_mass(unp1 - un, pnp1 - pn, vu / dt, vp / dt) \ - + 0.5*form_function(unp1, pnp1, vu, vp) \ - + 0.5*form_function(un, pn, vu, vp) + + 0.5*form_function(unp1, pnp1, vu, vp, time[i]) \ + + 0.5*form_function(un, pn, vu, vp, time[i] - dt) if i == 0: fullform = tform else: @@ -725,6 +935,7 @@ def test_solve_para_form(bc_opt, extruded): c = fd.Constant(1) M = [2, 2, 2] Ml = np.sum(M) + time = 0.01 # Parameters for the diag sparameters = { @@ -748,7 +959,7 @@ def test_solve_para_form(bc_opt, extruded): for i in range(sum(M)): solver_parameters_diag[f"diagfft_block_{i}_"] = sparameters - def form_function(u, v): + def form_function(u, v, t): return fd.inner((1.+c*fd.inner(u, u))*fd.grad(u), fd.grad(v))*fd.dx def form_mass(u, v): @@ -778,10 +989,9 @@ def form_mass(u, v): un.assign(u0) v = fd.TestFunction(V) - eqn = (unp1 - un)*v*fd.dx - eqn += fd.Constant(dt*(1-theta))*form_function(un, v) - eqn += fd.Constant(dt*theta)*form_function(unp1, v) + eqn += fd.Constant(dt*(1-theta))*form_function(un, v, time - dt) + eqn += fd.Constant(dt*theta)*form_function(unp1, v, time) sprob = fd.NonlinearVariationalProblem(eqn, unp1, bcs=bcs) solver_parameters = {'ksp_type': 'preonly', 'pc_type': 'lu'} @@ -796,6 +1006,7 @@ def form_mass(u, v): for i in range(Ml): ssolver.solve() + 
time += dt vfull_list[i].assign(unp1) un.assign(unp1) @@ -866,6 +1077,7 @@ def test_solve_para_form_mixed(extruded): M = [2, 2, 2] Ml = np.sum(M) + time = 0.01 # Parameters for the diag sparameters = { @@ -887,7 +1099,7 @@ def test_solve_para_form_mixed(extruded): solver_parameters_diag["diagfft_block_"] = sparameters - def form_function(uu, up, vu, vp): + def form_function(uu, up, vu, vp, t): return (fd.div(vu) * up + c * fd.sqrt(fd.inner(uu, uu) + eps) * fd.inner(uu, vu) - fd.div(uu) * vp) * fd.dx @@ -914,9 +1126,9 @@ def form_mass(uu, up, vu, vp): eqn = form_mass(*(fd.split(unp1)), *(fd.split(v))) eqn -= form_mass(*(fd.split(un)), *(fd.split(v))) eqn += fd.Constant(dt*(1-theta))*form_function(*(fd.split(un)), - *(fd.split(v))) + *(fd.split(v)), time - dt) eqn += fd.Constant(dt*theta)*form_function(*(fd.split(unp1)), - *(fd.split(v))) + *(fd.split(v)), time) sprob = fd.NonlinearVariationalProblem(eqn, unp1) solver_parameters = {'ksp_type': 'preonly', 'pc_type': 'lu', @@ -935,6 +1147,7 @@ def form_mass(uu, up, vu, vp): for i in range(Ml): ssolver.solve() + time += dt for k in range(2): vfull.sub(2*i+k).assign(unp1.sub(k)) un.assign(unp1) @@ -981,7 +1194,7 @@ def test_diagnostics(): for i in range(np.sum(M)): diag_sparameters["diagfft_block_" + str(i) + "_"] = block_sparameters - def form_function(u, v): + def form_function(u, v, t): return fd.inner(fd.grad(u), fd.grad(v))*fd.dx def form_mass(u, v): diff --git a/utils/serial/miniapp.py b/utils/serial/miniapp.py index 5c11aecc..500195e1 100644 --- a/utils/serial/miniapp.py +++ b/utils/serial/miniapp.py @@ -25,8 +25,8 @@ def __init__(self, :arg solver_parameters: options dictionary for nonlinear solver ''' self.dt = dt + self.time = fd.Constant(dt) self.theta = theta - self.initial_condition = w_initial self.function_space = w_initial.function_space() @@ -61,10 +61,9 @@ def set_theta_form(self, form_mass, form_function, dt, theta, w0, w1): v = fd.TestFunctions(w0.function_space()) w1s = fd.split(w1) w0s = fd.split(w0) - dqdt = form_mass(*w1s, *v) - form_mass(*w0s, *v) - L = theta*form_function(*w1s, *v) + (1 - theta)*form_function(*w0s, *v) + L = theta*form_function(*w1s, *v, self.time) + (1 - theta)*form_function(*w0s, *v, self.time - dt) return dt1*dqdt + L @@ -74,16 +73,14 @@ def solve(self, nt, ''' Integrate forward nt timesteps ''' - time = 0 for step in range(nt): - preproc(self, step, time) + preproc(self, step, self.time) self.nlsolver.solve() self.w0.assign(self.w1) + self.time.assign(self.time + self.dt) - time += self.dt - - postproc(self, step, time) + postproc(self, step, self.time) class ComparisonMiniapp(object): diff --git a/utils/shallow_water/linear.py b/utils/shallow_water/linear.py index 07d956a9..81617c72 100644 --- a/utils/shallow_water/linear.py +++ b/utils/shallow_water/linear.py @@ -20,11 +20,11 @@ def form_mass(mesh, u, h, w, p): # spatial forms for depth and velocity fields -def form_function_h(mesh, H, u, h, p): +def form_function_h(mesh, H, u, h, p, t): return (H*p*fd.div(u))*fd.dx -def form_function_u(mesh, g, f, u, h, w, perp=fd.cross): +def form_function_u(mesh, g, f, u, h, w, t, perp=fd.cross): outward_normals = fd.CellNormal(mesh) @@ -34,5 +34,5 @@ def prp(u): return (fd.inner(w, f*prp(u)) - g*h*fd.div(w))*fd.dx -def form_function(mesh, g, H, f, u, h, w, p): - return form_function_h(mesh, H, u, h, p) + form_function_u(mesh, g, f, u, h, w) +def form_function(mesh, g, H, f, u, h, w, p, t): + return form_function_h(mesh, H, u, h, p, t) + form_function_u(mesh, g, f, u, h, w, t) diff --git 
a/utils/shallow_water/miniapp.py b/utils/shallow_water/miniapp.py index 6da04787..6eb16ea9 100644 --- a/utils/shallow_water/miniapp.py +++ b/utils/shallow_water/miniapp.py @@ -20,6 +20,7 @@ def __init__(self, coriolis_expression=swe.earth_coriolis_expression, block_ctx={}, reference_depth=0, + reference_state=False, linear=False, velocity_function_space=swe.default_velocity_function_space, depth_function_space=swe.default_depth_function_space, @@ -38,9 +39,10 @@ def __init__(self, :arg theta: parameter for the implicit theta-method integrator :arg alpha: value used for the alpha-circulant approximation in the paradiag method. :arg time_partition: a list with how many timesteps are on each of the ensemble time-ranks. - arg :paradiag_sparameters: a dictionary of PETSc solver parameters for the solution of the all-at-once system + arg :paradiag_sparameters: a dictionary of PETSc solver parameters for the solution of the all-at-once system :arg block_ctx: a dictionary of extra values required for the block system solvers. :arg reference_depth: constant used to calculate elevation + :arg reference_state: Whether to create a reference state for the AllAtOnceSystem :arg linear: if False, solve nonlinear shallow water equations, if True solve linear equations :arg velocity_function_space: function to return a firedrake FunctionSpace for the velocity field, given a mesh :arg depth_function_space: function to return a firedrake FunctionSpace for the depth field, given a mesh @@ -71,23 +73,22 @@ def __init__(self, self.topography_function = fd.Function(self.depth_function_space(), name='topography') self.topography_function.interpolate(self.topography) - if linear: - def form_function(u, h, v, q): + def form_function(u, h, v, q, t): g = self.gravity H = self.reference_depth f = self.coriolis - return swe.linear.form_function(self.mesh, g, H, f, u, h, v, q) + return swe.linear.form_function(self.mesh, g, H, f, u, h, v, q, t) def form_mass(u, h, v, q): return swe.linear.form_mass(self.mesh, u, h, v, q) else: - def form_function(u, h, v, q): + def form_function(u, h, v, q, t): g = self.gravity b = self.topography f = self.coriolis - return swe.nonlinear.form_function(self.mesh, g, b, f, u, h, v, q) + return swe.nonlinear.form_function(self.mesh, g, b, f, u, h, v, q, t) def form_mass(u, h, v, q): return swe.nonlinear.form_mass(self.mesh, u, h, v, q) @@ -104,6 +105,12 @@ def form_mass(u, h, v, q): u0.project(velocity_expression(*x)) h0.project(depth_expression(*x)) + if reference_state: + self.reference_state = fd.Function(self.function_space()) + reference_state = self.reference_state + else: + reference_state = None + # non-petsc information for block solve # mesh transfer operators @@ -114,7 +121,7 @@ def form_mass(u, h, v, q): tm = mg.manifold_transfer_manager(self.function_space()) transfer_managers.append(tm) - block_ctx['diag_transfer_managers'] = transfer_managers + block_ctx['diagfft_transfer_managers'] = transfer_managers self.paradiag = asQ.paradiag( ensemble=self.ensemble, @@ -122,6 +129,7 @@ def form_mass(u, h, v, q): form_mass=form_mass, w0=w0, dt=dt, theta=theta, alpha=alpha, time_partition=time_partition, + reference_state=reference_state, solver_parameters=paradiag_sparameters, circ=None, ctx={}, block_ctx=block_ctx) diff --git a/utils/shallow_water/nonlinear.py b/utils/shallow_water/nonlinear.py index e42e43d5..6fcc2459 100644 --- a/utils/shallow_water/nonlinear.py +++ b/utils/shallow_water/nonlinear.py @@ -21,7 +21,7 @@ def form_mass(mesh, u, h, v, p): # spatial forms for depth and 
velocity fields -def form_function_depth(mesh, u, h, p): +def form_function_depth(mesh, u, h, p, t): n = fd.FacetNormal(mesh) uup = 0.5 * (fd.dot(u, n) + abs(fd.dot(u, n))) @@ -29,7 +29,7 @@ def form_function_depth(mesh, u, h, p): + fd.jump(p)*(uup('+')*h('+') - uup('-')*h('-'))*fd.dS) -def form_function_velocity(mesh, g, b, f, u, h, v, perp=fd.cross): +def form_function_velocity(mesh, g, b, f, u, h, v, t, perp=fd.cross): n = fd.FacetNormal(mesh) outward_normals = fd.CellNormal(mesh) @@ -49,5 +49,5 @@ def both(u): - fd.div(v)*(g*(h + b) + K)*fd.dx) -def form_function(mesh, g, b, f, u, h, v, q): - return form_function_velocity(mesh, g, b, f, u, h, v) + form_function_depth(mesh, u, h, q) +def form_function(mesh, g, b, f, u, h, v, q, t): + return form_function_velocity(mesh, g, b, f, u, h, v, t) + form_function_depth(mesh, u, h, q, t) From 0f4128c48bdd295c71d8d24c055c84e2efa4b7fe Mon Sep 17 00:00:00 2001 From: JHopeCollins Date: Tue, 27 Jun 2023 10:09:33 +0100 Subject: [PATCH 9/9] remove unsteady-advection script --- examples/advection/unsteady_dg_advection.py | 256 -------------------- 1 file changed, 256 deletions(-) delete mode 100644 examples/advection/unsteady_dg_advection.py diff --git a/examples/advection/unsteady_dg_advection.py b/examples/advection/unsteady_dg_advection.py deleted file mode 100644 index 1f760df4..00000000 --- a/examples/advection/unsteady_dg_advection.py +++ /dev/null @@ -1,256 +0,0 @@ -import matplotlib.pyplot as plt -from matplotlib.animation import FuncAnimation -from math import pi, cos, sin - -import firedrake as fd -from firedrake.petsc import PETSc -import asQ - -import argparse - -parser = argparse.ArgumentParser( - description='ParaDiag timestepping for scalar advection of a Gaussian bump in a periodic square with DG in space and implicit-theta in time.\n' - +'Advecting velocity varies in time sinusoidally.' 
- +'Based on the Firedrake DG advection example https://www.firedrakeproject.org/demos/DG_advection.py.html', - formatter_class=argparse.ArgumentDefaultsHelpFormatter -) -parser.add_argument('--nx', type=int, default=64, help='Number of cells along each square side.') -parser.add_argument('--cfl', type=float, default=0.8, help='Convective CFL number.') -parser.add_argument('--angle', type=float, default=pi/6, help='Angle of the convective velocity.') -parser.add_argument('--degree', type=int, default=1, help='Degree of the scalar and velocity spaces.') -parser.add_argument('--theta', type=float, default=0.5, help='Parameter for the implicit theta timestepping method.') -parser.add_argument('--width', type=float, default=0.1, help='Width of the Gaussian bump.') -parser.add_argument('--nwindows', type=int, default=1, help='Number of time-windows.') -parser.add_argument('--nslices', type=int, default=2, help='Number of time-slices per time-window.') -parser.add_argument('--slice_length', type=int, default=2, help='Number of timesteps per time-slice.') -parser.add_argument('--alpha', type=float, default=0.0001, help='Circulant coefficient.') -parser.add_argument('--nsample', type=int, default=32, help='Number of sample points for plotting.') -parser.add_argument('--show_args', action='store_true', help='Output all the arguments.') - -args = parser.parse_known_args() -args = args[0] - -if args.show_args: - PETSc.Sys.Print(args) - -# The time partition describes how many timesteps are included on each time-slice of the ensemble -# Here we use the same number of timesteps on each slice, but they can be different - -time_partition = tuple(args.slice_length for _ in range(args.nslices)) -window_length = sum(time_partition) -nsteps = args.nwindows*window_length - -# Calculate the timestep from the CFL number -umax = 1. -dx = 1./args.nx -dt = args.cfl*dx/umax - -# timescale of the domain -T = 1./umax - -period = 0.5*T - -# The Ensemble with the spatial and time communicators -ensemble = asQ.create_ensemble(time_partition) - -# # # === --- domain --- === # # # - -# The mesh needs to be created with the spatial communicator -mesh = fd.PeriodicUnitSquareMesh(args.nx, args.nx, quadrilateral=True, comm=ensemble.comm) - -# We use a discontinuous Galerkin space for the advected scalar -# and a continuous Galerkin space for the advecting velocity field -V = fd.FunctionSpace(mesh, "DQ", args.degree) -W = fd.VectorFunctionSpace(mesh, "CG", args.degree) - -# # # === --- initial conditions --- === # # # - -x, y = fd.SpatialCoordinate(mesh) - - -def radius(x, y): - return fd.sqrt(pow(x-0.5, 2) + pow(y-0.5, 2)) - - -def gaussian(x, y): - return fd.exp(-0.5*pow(radius(x, y)/args.width, 2)) - - -# The scalar initial conditions are a Gaussian bump centred at (0.5, 0.5) -q0 = fd.Function(V, name="scalar_initial") -q0.interpolate(1 + gaussian(x, y)) - -# The advecting velocity field is constant and directed at an angle to the x-axis -u = fd.Function(W, name='velocity') -u.interpolate(0.5*fd.as_vector((umax*cos(args.angle), umax*sin(args.angle)))) - -# time-varying perturbation to advecting velocity -up = fd.Function(W, name='velocity-perturbation') -up.interpolate(fd.as_vector((1.0*umax, -1.0*umax))) - -# # # === --- finite element forms --- === # # # - - -# The time-derivative mass form for the scalar advection equation. 
-# asQ assumes that the mass form is linear so here -# q is a TrialFunction and phi is a TestFunction -def form_mass(q, phi): - return phi*q*fd.dx - - -# The DG advection form for the scalar advection equation. -# asQ assumes that the function form is nonlinear so here -# q is a Function and phi is a TestFunction -def form_function(q, phi, t): - # upwind switch - v = u + up*fd.cos(2*pi*t/T) - n = fd.FacetNormal(mesh) - un = 0.5*(fd.dot(v, n) + abs(fd.dot(v, n))) - - # integration over element volume - int_cell = q*fd.div(phi*v)*fd.dx - - # integration over internal facets - int_facet = (phi('+')-phi('-'))*(un('+')*q('+')-un('-')*q('-'))*fd.dS - - return int_facet - int_cell - - -# # # === --- PETSc solver parameters --- === # # # - - -# The PETSc solver parameters used to solve the -# blocks in step (b) of inverting the ParaDiag matrix. -block_parameters = { - 'ksp_type': 'preonly', - 'pc_type': 'lu', -} - -# The PETSc solver parameters for solving the all-at-once system. -# The python preconditioner 'asQ.DiagFFTPC' applies the ParaDiag matrix. -# -# The equation is linear so we can either: -# a) Solve it in one shot using a preconditioned Krylov method: -# P^{-1}Au = P^{-1}b -# The solver options for this are: -# 'ksp_type': 'fgmres' -# We need fgmres here because gmres is used on the blocks. -# b) Solve it with Picard iterations: -# Pu_{k+1} = (P - A)u_{k} + b -# The solver options for this are: -# 'ksp_type': 'preonly' - -paradiag_parameters = { - 'snes_type': 'ksponly', - 'snes': { - 'monitor': None, - 'converged_reason': None, - 'rtol': 1e-8, - }, - 'mat_type': 'matfree', - 'ksp_type': 'gmres', - 'ksp': { - 'monitor': None, - 'converged_reason': None, - 'rtol': 1e-8, - }, - 'pc_type': 'python', - 'pc_python_type': 'asQ.DiagFFTPC' -} - -# We need to add a block solver parameters dictionary for each block. -# Here they are all the same but they could be different. -for i in range(window_length): - paradiag_parameters['diagfft_block_'+str(i)+'_'] = block_parameters - - -# # # === --- Setup ParaDiag --- === # # # - - -# Give everything to asQ to create the paradiag object. -# the circ parameter determines where the alpha-circulant -# approximation is introduced. None means only in the preconditioner. -pdg = asQ.paradiag(ensemble=ensemble, - form_function=form_function, - form_mass=form_mass, - w0=q0, dt=dt, theta=args.theta, - alpha=args.alpha, time_partition=time_partition, - solver_parameters=paradiag_parameters, - circ=None) - - -# This is a callback which will be called before pdg solves each time-window -# We can use this to make the output a bit easier to read -def window_preproc(pdg, wndw): - PETSc.Sys.Print('') - PETSc.Sys.Print(f'### === --- Calculating time-window {wndw} --- === ###') - PETSc.Sys.Print('') - - -# The last time-slice will be saving snapshots to create an animation. -# The layout member describes the time_partition. -# layout.is_local(i) returns True/False if the timestep index i is on the -# current time-slice. Here we use -1 to mean the last timestep in the window. -is_last_slice = pdg.layout.is_local(-1) - -# Make an output Function on the last time-slice and start a snapshot list -if is_last_slice: - qout = fd.Function(V) - timeseries = [q0.copy(deepcopy=True)] - - -# This is a callback which will be called after pdg solves each time-window -# We can use this to save the last timestep of each window for plotting. -def window_postproc(pdg, wndw): - if is_last_slice: - # The aaos is the AllAtOnceSystem which represents the time-dependent problem. 
- # get_field extracts one timestep of the window. -1 is again used to get the last - # timestep and place it in qout. - pdg.aaos.get_field(-1, index_range='window', wout=qout) - timeseries.append(qout.copy(deepcopy=True)) - - -# Solve nwindows of the all-at-once system -pdg.solve(args.nwindows, - preproc=window_preproc, - postproc=window_postproc) - - -# # # === --- Postprocessing --- === # # # - -# paradiag collects a few solver diagnostics for us to inspect -nw = args.nwindows - -# Number of nonlinear iterations, total and per window. -# (1 for fgmres and # picard iterations for preonly) -PETSc.Sys.Print(f'nonlinear iterations: {pdg.nonlinear_iterations} | iterations per window: {pdg.nonlinear_iterations/nw}') - -# Number of linear iterations, total and per window. -# (# of gmres iterations for fgmres and # picard iterations for preonly) -PETSc.Sys.Print(f'linear iterations: {pdg.linear_iterations} | iterations per window: {pdg.linear_iterations/nw}') - -# Number of iterations needed for each block in step-(b), total and per block solve -# The number of iterations for each block will usually be different because of the different eigenvalues -PETSc.Sys.Print(f'block linear iterations: {pdg.block_iterations._data} | iterations per block solve: {pdg.block_iterations._data/pdg.linear_iterations}') - -# We can write these diagnostics to file, along with some other useful information. -# Files written are: aaos_metrics.txt, block_metrics.txt, paradiag_setup.txt, solver_parameters.txt -asQ.write_paradiag_metrics(pdg, directory='metrics') - -# Make an animation from the snapshots we collected and save it to periodic.mp4. -if is_last_slice: - - fn_plotter = fd.FunctionPlotter(mesh, num_sample_points=args.nsample) - - fig, axes = plt.subplots() - axes.set_aspect('equal') - colors = fd.tripcolor(qout, num_sample_points=args.nsample, vmin=1, vmax=2, axes=axes) - fig.colorbar(colors) - - def animate(q): - colors.set_array(fn_plotter(q)) - - interval = 4e2 - animation = FuncAnimation(fig, animate, frames=timeseries, interval=interval) - - animation.save("periodic.mp4", writer="ffmpeg")