diff --git a/README.md b/README.md
index e7803d6..e7765df 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ An optimization history file 'optim.dat' will be flushed to the examples subfold
 
 ## Contributors
 
-* Stefanie Guenther 
+* Stefanie Guenther
 * Eric C. Cyr
 * J.B. Schroder
 * Roland A. Siegbert
diff --git a/examples/peaks/peaks.cfg b/examples/peaks/peaks.cfg
index 3599df6..542c003 100644
--- a/examples/peaks/peaks.cfg
+++ b/examples/peaks/peaks.cfg
@@ -3,7 +3,7 @@
 ################################
 
 # relative data folder location
-datafolder = examples/peaks
+datafolder = ./
 # filename of training data feature vectors
 ftrain_ex = features_training.dat
 # filename of training data labels/classes
@@ -47,7 +47,7 @@ type_openlayer = activate
 # factor for scaling initial opening layer weights and bias
 weights_open_init = 1e-3
 # factor for scaling initial weights and bias of intermediate layers
-weights_init = 0e-3
+weights_init = 1e-3
 # factor for scaling initial classification weights and bias
 weights_class_init = 1e-3
 
@@ -66,7 +66,7 @@ braid_maxlevels = 10
 # minimum allowed coarse time time grid size (values in 10-30 are usually best)
 braid_mincoarse = 10
 # maximum number of iterations
-braid_maxiter = 15
+braid_maxiter = 2
 # absolute tolerance
 braid_abstol = 1e-15
 # absolute adjoint tolerance
@@ -100,7 +100,7 @@ nbatch = 5000
 # relaxation param for tikhonov term
 gamma_tik = 1e-7
 # relaxation param for time-derivative term
-gamma_ddt = 1e-7
+gamma_ddt = 1e-5
 # relaxation param for tikhonov term of classification weights
 gamma_class = 1e-7
 # stepsize selection type ("fixed" or "backtrackingLS" or "oneoverk")
@@ -112,19 +112,19 @@ stepsize_type = backtrackingLS
 # initial stepsize
 stepsize = 1.0
 # maximum number of optimization iterations
-optim_maxiter = 10
+optim_maxiter = 130
 # absolute stopping criterion for the gradient norm
 gtol = 1e-4
 # maximum number of linesearch iterations
-ls_maxiter = 20
+ls_maxiter = 15
 # factor for modifying the stepsize within a linesearch iteration
 ls_factor = 0.5
 # Hessian Approximation ("BFGS", "L-BFGS" or "Identity")
 hessian_approx = L-BFGS
 # number of stages for l-bfgs method
-lbfgs_stages = 20
+lbfgs_stages = 10
 # level for validation computation:
 #   -1 = never validate
 #    0 = validate only after optimization finishes.
 #    1 = validate in each optimization iteration
-validationlevel = 0
+validationlevel = 1
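Note on the line-search settings above: with stepsize_type = backtrackingLS, each optimization iteration starts from the initial stepsize (1.0) and shrinks it by ls_factor (0.5) for at most ls_maxiter (now 15) trials until the objective decreases. A minimal C++ sketch of that convention; the evalTrialObjective callback and the plain decrease test are assumptions for illustration, not this repository's actual code:

#include <functional>

// Hypothetical backtracking line search driven by the config values above.
double backtrackingLineSearch(double stepsize, double ls_factor, int ls_maxiter,
                              double current_objective,
                              const std::function<double(double)> &evalTrialObjective) {
  for (int ls_iter = 0; ls_iter < ls_maxiter; ls_iter++) {
    // Accept the first trial step that improves the objective ...
    if (evalTrialObjective(stepsize) < current_objective) break;
    // ... otherwise shrink it: 1.0 -> 0.5 -> 0.25 -> ... with ls_factor = 0.5.
    stepsize *= ls_factor;
  }
  return stepsize;
}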
diff --git a/include/braid_wrapper.hpp b/include/braid_wrapper.hpp
index 91bedb5..5af32a3 100644
--- a/include/braid_wrapper.hpp
+++ b/include/braid_wrapper.hpp
@@ -10,15 +10,14 @@
 
 #pragma once
 
 /**
- * Define the state vector at one time-step
+ * Define the network state at one layer. It contains the transformed data batch in the vector **state, and a pointer to the actual layer.
  */
 class myBraidVector {
  protected:
   int nbatch;    /* Number of examples */
   int nchannels; /* Number of channels */
-  MyReal *
-      *state; /* Network state at one layer, dimensions: nbatch * nchannels */
+  MyReal **state; /* Network state at one layer, dimensions: nbatch * nchannels */
   Layer *layer; /* Pointer to layer information */
 
   /* Flag that determines if the layer and state have just been received and
@@ -51,8 +50,9 @@
 };
 
 /**
- * Wrapper for the primal braid app.
- * virtual function are overwritten from the adjoint app class
+ * Wrapper for the primal braid app. The most important routines are the Step function, which applies the layer transformation (and hence steps forward to the next layer); SetInitialCondition, which applies the opening layer; and EvaluateObjective, which evaluates the loss function and adds the regularization terms to obtain the objective function value.
+ *
+ * The adjoint braid app inherits from this class and overrides these routines with the corresponding derivative computations.
  */
 class myBraidApp : public BraidApp {
  protected:
diff --git a/include/layer.hpp b/include/layer.hpp
index 036ead8..bc0ea6c 100644
--- a/include/layer.hpp
+++ b/include/layer.hpp
@@ -62,7 +62,8 @@ class Layer {
   /* Set time step size */
   void setDt(MyReal DT);
 
-  /* Set design and gradient memory location */
+  /* Set design and gradient memory location.
+   * The design vector is allocated within the Network block. For each layer in the block, the layer's local memory location within the network's design vector is passed in here and stored as *weights and *bias (and their derivatives weights_bar and bias_bar). */
   void setMemory(MyReal *design_memloc, MyReal *gradient_memloc);
 
   /* Some Get..() functions */
@@ -90,13 +91,13 @@
   int getnConv();
   int getCSize();
 
-  /* Get the layer index (i.e. the time step) */
+  /* Get the layer's ID (i.e. the time step number) */
   int getIndex();
 
   /* Prints to screen */
   void print_data(MyReal *data_Out);
 
-  /* Activation function and derivative */
+  /* Apply the activation function and its derivative */
   MyReal activation(MyReal x);
   MyReal dactivation(MyReal x);
 
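The setMemory contract documented above implies that the block-owned design and gradient vectors are partitioned into contiguous per-layer slices. An illustrative helper under that assumption; the loop, the offset arithmetic, and the getnDesign accessor (assumed to return a layer's weights-plus-bias count) are hypothetical, not code from this diff:

#include "layer.hpp"

// Sketch: hand each layer of a block its slice of the flat design and
// gradient memory allocated by the Network.
void assignBlockMemory(Layer **layer, int nlayers_local,
                       MyReal *design, MyReal *gradient) {
  int offset = 0;
  for (int i = 0; i < nlayers_local; i++) {
    layer[i]->setMemory(design + offset, gradient + offset);
    offset += layer[i]->getnDesign();  // assumed accessor: weights + bias entries
  }
}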
diff --git a/include/network.hpp b/include/network.hpp
index 3b6996d..2337263 100644
--- a/include/network.hpp
+++ b/include/network.hpp
@@ -9,13 +9,20 @@
 #include "util.hpp"
 
 #pragma once
 
+/*
+ * The Network class logically connects the layers.
+ * Each processor instantiates one object of this class, holding a sub-block
+ * of layers [startlayerID, endlayerID], where these IDs range from -1 (the opening layer) to nlayers_global-1 (the classification layer); the actual startlayerIDs and endlayerIDs for each processor are determined by XBraid.
+ * All layers are stored in the vector **layer, except for the opening layer, which is in *openlayer.
+ * Each network block contains (and allocates!) the *design and *gradient vectors, which hold the vectorized weights and biases of its layers (see createNetworkBlock).
+ */
 class Network {
  protected:
   int nlayers_global; /* Total number of Layers of the network */
   int nlayers_local;  /* Number of Layers in this network block */
   int nchannels;      /* Width of the network */
-  MyReal dt;          /* Time step size */
+  MyReal dt;          /* Time step size (distance between two layers) */
   MyReal loss;        /* Value of the loss function */
   MyReal accuracy;    /* Accuracy of the network prediction (percentage of
                          successfully predicted classes) */
@@ -44,6 +51,10 @@
 
   ~Network();
 
+  /*
+   * Calls the layer constructors for all layers
+   * in [StartLayerID, EndLayerID].
+   */
   void createNetworkBlock(int StartLayerID, int EndLayerID, Config *config,
                           MPI_Comm Comm);
 
@@ -105,6 +116,8 @@
    */
   void setInitialDesign(Config *config);
 
+  /* Helper function for createNetworkBlock: checks which kind of layer is required at the given index
+   * and calls the corresponding layer constructor. */
   Layer *createLayer(int index, Config *config);
 
   /* Replace the layer with one that is received from the left neighbouring
@@ -113,6 +126,8 @@
 
   /**
    * Applies the classification and evaluates loss/accuracy
+   * This routine should only be called on the last processor, which holds the classification layer.
+   * TODO: consider whether this routine belongs outside the Network class.
    */
   void evalClassification(DataSet *data, MyReal **state, int output);
 
@@ -125,6 +140,7 @@
   /**
    * Update the network design parameters: new_design = old_design + stepsize *
   * direction
+   * TODO: this might rather be a routine of the optimizer.
    */
   void updateDesign(MyReal stepsize, MyReal *direction, MPI_Comm comm);
 };
diff --git a/src/main.cpp b/src/main.cpp
index a71b812..68965f0 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -77,15 +77,14 @@ int main(int argc, char *argv[]) {
   MyReal ls_objective, test_obj;
   int ls_iter;
 
-  /* --- other --- */
-  // TODO: What is this? Why do you need it?
-  int myid;
-  int size;
+  /* --- Time measurements --- */
   struct rusage r_usage;
   MyReal StartTime, StopTime, myMB, globalMB;
   MyReal UsedTime = 0.0;
 
   /* Initialize MPI */
+  int myid;
+  int size;
   MPI_Init(&argc, &argv);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);
   MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -146,7 +145,7 @@ int main(int argc, char *argv[]) {
                  ndesign_global);
 
   /* Initialize Hessian approximation */
-  HessianApprox *hessian = 0;
+  HessianApprox *hessian = NULL;
   switch (config->hessianapprox_type) {
     case BFGS_SERIAL:
       hessian = new BFGS(MPI_COMM_WORLD, ndesign_local);
@@ -162,8 +161,6 @@ int main(int argc, char *argv[]) {
       return 0;
   }
 
-  /* Allocate ascent direction for design updates */
-
   /* Initialize optimization parameters */
   ascentdir = new MyReal[ndesign_local];
   stepsize = config->getStepsize(0);
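For reference, the update rule documented on updateDesign (new_design = old_design + stepsize * direction) is a plain in-place axpy over one block's local design variables. A minimal sketch of that rule as a free function; the MyReal typedef, the parameter names, and the omission of any cross-processor synchronization are this sketch's assumptions, not the repository's member routine:

typedef double MyReal;  // assumption: the repo's MyReal is a floating-point type

// Sketch: new_design = old_design + stepsize * direction, applied in place.
void applyDesignUpdate(MyReal *design, int ndesign_local,
                       MyReal stepsize, const MyReal *direction) {
  for (int i = 0; i < ndesign_local; i++)
    design[i] += stepsize * direction[i];
}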