From 9f3ca5e87dbe32521f8d73dbc9989669455f7683 Mon Sep 17 00:00:00 2001 From: Neo-X Date: Sun, 12 Jul 2020 12:59:06 -0700 Subject: [PATCH 01/18] Adding setup --- requirements.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..375051042 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,31 @@ +glfw>=1.4.0 +numpy>=1.11 +Cython>=0.27.2 +imageio>=2.1.2 +cffi>=1.10 +imagehash>=3.4 +ipdb +Pillow>=4.0.0 +pycparser>=2.17.0 +pytest>=3.0.5 +pytest-instafail==0.3.0 +scipy>=0.18.0 +sphinx +sphinx_rtd_theme +numpydoc +cloudpickle==0.5.2 +cached-property==1.3.1 +gym[all]==0.10.5 +gitpython==2.1.7 +gtimer==1.0.0b5 +awscli==1.11.179 +boto3==1.4.8 +ray==0.2.2 +path.py==10.3.1 +http://download.pytorch.org/whl/cu90/torch-0.4.1-cp35-cp35m-linux_x86_64.whl +joblib==0.9.4 +opencv-python==3.4.0.12 +torchvision==0.2.0 +sk-video==1.1.10 +git+https://github.com/vitchyr/multiworld.git +moviepy From 6dc9b9e6f297baf49979d96f177a232467bb5836 Mon Sep 17 00:00:00 2001 From: Neo-X Date: Sun, 12 Jul 2020 14:02:34 -0700 Subject: [PATCH 02/18] adding comet logging --- rlkit/core/logging.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rlkit/core/logging.py b/rlkit/core/logging.py index 3aa949705..914662412 100644 --- a/rlkit/core/logging.py +++ b/rlkit/core/logging.py @@ -90,11 +90,18 @@ def __init__(self): self._log_tabular_only = False self._header_printed = False + self._comet_log = None self.table_printer = TerminalTablePrinter() def reset(self): self.__init__() + def set_comet_logger(self, log): + self._comet_log = log + + def get_comet_logger(self): + return self._comet_log + def _add_output(self, file_name, arr, fds, mode='a'): if file_name not in arr: mkdir_p(os.path.dirname(file_name)) @@ -173,6 +180,9 @@ def log(self, s, with_prefix=True, with_timestamp=True): def record_tabular(self, key, val): self._tabular.append((self._tabular_prefix_str + str(key), str(val))) + if (self._comet_log is not None): + self._comet_log.log_metrics({str(self._tabular_prefix_str) + str(key): val}) +# logger.set_step(step=settings["round"]) def record_dict(self, d, prefix=None): if prefix is not None: From 92db3eeb5d8d40e22ca7b6a4b71a29d7c2d08f43 Mon Sep 17 00:00:00 2001 From: Neo-X Date: Wed, 15 Jul 2020 15:46:46 -0700 Subject: [PATCH 03/18] Working on requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 375051042..1405e6011 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,3 +29,4 @@ torchvision==0.2.0 sk-video==1.1.10 git+https://github.com/vitchyr/multiworld.git moviepy +comet_ml From 1c4130427794428cc37c25128d49fd0e24bff3fa Mon Sep 17 00:00:00 2001 From: Neo-X Date: Wed, 15 Jul 2020 21:58:45 -0700 Subject: [PATCH 04/18] Working on requirements and not needing mujoco --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1405e6011..b1b02ce9d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,6 @@ joblib==0.9.4 opencv-python==3.4.0.12 torchvision==0.2.0 sk-video==1.1.10 -git+https://github.com/vitchyr/multiworld.git +# git+https://github.com/vitchyr/multiworld.git moviepy comet_ml From eddb596da55a57e312f13cc8679103ff97d37193 Mon Sep 17 00:00:00 2001 From: Neo-X Date: Wed, 15 Jul 2020 22:04:44 -0700 Subject: [PATCH 05/18] Working on requirements and not needing mujoco --- requirements.txt | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b1b02ce9d..439c6be62 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ sphinx_rtd_theme numpydoc cloudpickle==0.5.2 cached-property==1.3.1 -gym[all]==0.10.5 +gym==0.10.5 gitpython==2.1.7 gtimer==1.0.0b5 awscli==1.11.179 From 58326b0fe0125e6d9d433c8db6c6fd7841271a5f Mon Sep 17 00:00:00 2001 From: Neo-X Date: Wed, 17 Feb 2021 12:39:39 -0800 Subject: [PATCH 06/18] Editing buffers to save memory --- rlkit/data_management/env_replay_buffer.py | 3 ++- rlkit/data_management/simple_replay_buffer.py | 5 +++-- rlkit/samplers/data_collector/path_collector.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/rlkit/data_management/env_replay_buffer.py b/rlkit/data_management/env_replay_buffer.py index 93c43ee9e..fb615bb33 100644 --- a/rlkit/data_management/env_replay_buffer.py +++ b/rlkit/data_management/env_replay_buffer.py @@ -10,7 +10,8 @@ def __init__( self, max_replay_buffer_size, env, - env_info_sizes=None + env_info_sizes=None, + dtype="float32" ): """ :param max_replay_buffer_size: diff --git a/rlkit/data_management/simple_replay_buffer.py b/rlkit/data_management/simple_replay_buffer.py index 424761bf4..6aa6ad3d3 100644 --- a/rlkit/data_management/simple_replay_buffer.py +++ b/rlkit/data_management/simple_replay_buffer.py @@ -13,15 +13,16 @@ def __init__( self, max_replay_buffer_size, observation_dim, action_dim, env_info_sizes, + dtype="float32" ): self._observation_dim = observation_dim self._action_dim = action_dim self._max_replay_buffer_size = max_replay_buffer_size - self._observations = np.zeros((max_replay_buffer_size, observation_dim)) + self._observations = np.zeros((max_replay_buffer_size, observation_dim), dtype=dtype) # It's a bit memory inefficient to save the observations twice, # but it makes the code *much* easier since you no longer have to # worry about termination conditions.
- self._next_obs = np.zeros((max_replay_buffer_size, observation_dim)) + self._next_obs = np.zeros((max_replay_buffer_size, observation_dim), dtype=dtype) self._actions = np.zeros((max_replay_buffer_size, action_dim)) # Make everything a 2D np array to make it easier for other code to # reason about the shape of the data diff --git a/rlkit/samplers/data_collector/path_collector.py b/rlkit/samplers/data_collector/path_collector.py index bec9ea006..4bf87b0b1 100644 --- a/rlkit/samplers/data_collector/path_collector.py +++ b/rlkit/samplers/data_collector/path_collector.py @@ -44,6 +44,7 @@ def collect_new_paths( self._policy, max_path_length=max_path_length_this_loop, ) +# print ("path: ", path["observations"]) path_len = len(path['actions']) if ( path_len != max_path_length From a2f96bb273e72f5f35c157ec4ab3dcc751d5718a Mon Sep 17 00:00:00 2001 From: Neo-X Date: Wed, 17 Feb 2021 16:43:08 -0800 Subject: [PATCH 07/18] Adding ppo --- .../policy_gradient/categorical_mlp_policy.py | 71 ++++++ .../continuous_bernoulli_mlp_policy.py | 110 +++++++++ rlkit/torch/policy_gradient/gaussian_mlp.py | 28 +++ .../policy_gradient/gaussian_mlp_policy.py | 55 +++++ .../gaussian_mlp_policy_fixed_var.py | 77 +++++++ rlkit/torch/policy_gradient/parameter.py | 22 ++ rlkit/torch/policy_gradient/ppo_algorithm.py | 46 ++++ rlkit/torch/policy_gradient/ppo_trainer.py | 192 ++++++++++++++++ rlkit/torch/policy_gradient/ppo_trainer_v2.py | 217 ++++++++++++++++++ rlkit/torch/policy_gradient/reference_ppo.py | 112 +++++++++ rlkit/torch/policy_gradient/vpg_trainer.py | 67 ++++++ 11 files changed, 997 insertions(+) create mode 100644 rlkit/torch/policy_gradient/categorical_mlp_policy.py create mode 100644 rlkit/torch/policy_gradient/continuous_bernoulli_mlp_policy.py create mode 100644 rlkit/torch/policy_gradient/gaussian_mlp.py create mode 100644 rlkit/torch/policy_gradient/gaussian_mlp_policy.py create mode 100644 rlkit/torch/policy_gradient/gaussian_mlp_policy_fixed_var.py create mode 100644 rlkit/torch/policy_gradient/parameter.py create mode 100644 rlkit/torch/policy_gradient/ppo_algorithm.py create mode 100644 rlkit/torch/policy_gradient/ppo_trainer.py create mode 100644 rlkit/torch/policy_gradient/ppo_trainer_v2.py create mode 100644 rlkit/torch/policy_gradient/reference_ppo.py create mode 100644 rlkit/torch/policy_gradient/vpg_trainer.py diff --git a/rlkit/torch/policy_gradient/categorical_mlp_policy.py b/rlkit/torch/policy_gradient/categorical_mlp_policy.py new file mode 100644 index 000000000..f666db06d --- /dev/null +++ b/rlkit/torch/policy_gradient/categorical_mlp_policy.py @@ -0,0 +1,71 @@ +import torch +import numpy as np +import pdb + +from railrl.torch.core import PyTorchModule +from railrl.policies.base import Policy +import railrl.torch.pytorch_util as ptu +import torch.distributions as tdist + +class CategoricalMlpPolicy(Policy, PyTorchModule): + def __init__(self, action_network, return_onehot=True): + super().__init__() + self.action_network = action_network + self.return_onehot = return_onehot + if not return_onehot: + raise NotImplementedError("not validated with cartpole") + self.action_dim = self.action_network._modules['last_fc'].out_features + + def log_prob(self, one_hot_actions, probs, none): + """ + + :param one_hot_actions: (B, A) one-hot actions + :param probs: (B, A) per-action probabilities + :returns: + :rtype: + + """ + assert(probs.shape[-1] == self.action_dim) + assert(one_hot_actions.shape[-1] == self.action_dim) + # Replay buffer stores discrete actions as onehots + return 
torch.log(probs[torch.arange(one_hot_actions.shape[0]), one_hot_actions.argmax(1)]) + + def get_action(self, observation, argmax=False): + action_dist = self.forward(ptu.from_numpy(observation)) + action_idx = self.rsample(*action_dist) + if argmax: action_idx[0, 0] = torch.argmax(action_dist[0]) + action_onehot = ptu.zeros(action_dist[0].shape, dtype=torch.int64) + action_onehot[0, action_idx[0, 0]] = 1 + action_log_prob = self.log_prob(action_onehot, *action_dist) + agent_info = dict(action_log_prob=ptu.get_numpy(action_log_prob), action_dist=ptu.get_numpy(action_dist[0])) + if self.return_onehot: + return ptu.get_numpy(action_onehot).flatten().tolist(), agent_info + else: + return ptu.get_numpy(action_idx).ravel().item(), agent_info + + def entropy(self, probs, none): + return - (probs * torch.log(probs)).sum(-1) + + def rsample(self, probs, none): + s = tdist.Categorical(probs, validate_args=True).sample((1,)) + return s + + def forward(self, input): + if len(input.shape) == 1: + action_probs = self.action_network(input.view(1, -1)) + else: + action_probs = self.action_network(input) + return (action_probs, None) + + def kl(self, source_probs, dest_probs): + source_log_probs = torch.log(source_probs) + dest_log_probs = torch.log(dest_probs) + assert(source_probs.shape[-1] == self.action_dim) + assert(dest_probs.shape[-1] == self.action_dim) + + # These must be true for discrete action spaces. + assert(0 <= source_probs.min() <= source_probs.max() <= 1) + assert(0 <= dest_probs.min() <= dest_probs.max() <= 1) + kl = (source_probs * (source_log_probs - dest_log_probs)).sum(-1) + assert(ptu.get_numpy(kl.min()) >= -1e-5) + return kl diff --git a/rlkit/torch/policy_gradient/continuous_bernoulli_mlp_policy.py b/rlkit/torch/policy_gradient/continuous_bernoulli_mlp_policy.py new file mode 100644 index 000000000..2656e2c74 --- /dev/null +++ b/rlkit/torch/policy_gradient/continuous_bernoulli_mlp_policy.py @@ -0,0 +1,110 @@ + +import numpy as np +import torch +import pdb + +import railrl.torch.core as core +import railrl.policies.base as base +import railrl.misc.class_util as classu +import railrl.torch.pytorch_util as ptu + +class ContinuousBernoulliMlpPolicy(base.Policy, core.PyTorchModule): + def __init__(self, mlp, action_space=None, scale=None, shift=None): + """ + + policy whose actions are in [scale*0 + shift, scale*1 + shift] == [shift, scale + shift] + + It uses a continuous bernoulli distribution, valid for inputs [0, 1], and transforms actions into that space first. + + :param mlp: network that outputs vectors in R^n + :param action_space: action space + :param scale: multiplicative scaling, before shift. zero-scale transformations are turned into 1s, silently. + :param shift: additive shift, after scaling + :returns: + :rtype: + + """ + super().__init__() + + self.mlp = mlp + + if action_space is not None: + self.shift = ptu.from_numpy(action_space.low) + self.scale = ptu.from_numpy(action_space.high - action_space.low) + elif scale is not None and shift is not None: + self.scale = ptu.from_numpy(scale) + self.shift = ptu.from_numpy(shift) + else: + raise ValueError(f"{self.__class__.__name__}.__init__() fail") + + # Replace zero-scale transformations with 1s. + self.scale[torch.eq(self.scale, 0.0)] = 1. 
+ + def _action_space_to_event_space(self, x): + """ + + :param x: values in [shift, scale + shift] + :returns: values in [0, 1] + :rtype: + + """ + + self._validate_action_space(x) + return (x - self.shift) / self.scale + + def _event_space_to_action_space(self, x): + self._validate_event_space(x) + return self.scale * x + self.shift + + def _validate_action_space(self, x): + # Equivalent to checking that it's in the action space) + assert(torch.ge(x, self.shift).all() and torch.le(x, self.scale + self.shift).all()) + + def _validate_event_space(self, x): + assert(torch.ge(x, 0).all() and torch.le(x, 1).all()) + + def _dist(self, logits): + # TODO remove validation + return torch.distributions.continuous_bernoulli.ContinuousBernoulli(logits=logits, validate_args=True) + + def log_prob(self, x, logits=None, dist=None, is_event_space=False, **kwargs): + if dist is None: dist = self._dist(logits) + if not is_event_space: x = self._action_space_to_event_space(x) + # Uses naive-bayes assumption for multidimensional input + action_log_prob = dist.log_prob(x).sum(axis=-1) + return action_log_prob + + def entropy(self, logits, **kwargs): + # Uses naive-bayes assumption for multidimensional input + # TODO this is haphazard, I haven't checked if this is mathematically correct. + return self._dist(logits).entropy().sum(axis=-1) + + def get_action(self, observation, use_sample=True): + dist = self._dist(self.forward(ptu.from_numpy(observation))[0]) + # Mean + # action_event_space = dist.mean + + # Mode + # action_event_space = torch.round(dist.mean) + + # This is probably preferred for exploration purposes. + if use_sample: + action_event_space = dist.sample() + else: + raise NotImplementedError + action_event_space = dist.mean + + action_log_prob = self.log_prob(action_event_space, dist=dist, is_event_space=True) + agent_info = dict(action_log_prob=ptu.get_numpy(action_log_prob)) + action_action_space = self._event_space_to_action_space(action_event_space) + return ptu.get_numpy(action_action_space), agent_info + + def forward(self, input): + """ + + :param input: + :returns: logits + :rtype: len-1 tuple + + """ + return (self.mlp(input),) diff --git a/rlkit/torch/policy_gradient/gaussian_mlp.py b/rlkit/torch/policy_gradient/gaussian_mlp.py new file mode 100644 index 000000000..ffde4dce7 --- /dev/null +++ b/rlkit/torch/policy_gradient/gaussian_mlp.py @@ -0,0 +1,28 @@ +import torch +from railrl.torch.core import PyTorchModule +import numpy as np + +class GaussianMlp(PyTorchModule): + def __init__(self, + mean_mlp, + log_var_mlp, + min_variance=1e-3, + ): + super().__init__() + if min_variance is None: + self.log_min_variance = None + else: + self.log_min_variance = float(np.log(min_variance)) + + self.mean_mlp = mean_mlp + self.log_var_mlp = log_var_mlp + + def forward(self, input): + + mean = self.mean_mlp(input) + logvar = self.log_var_mlp(input) + + if self.log_min_variance is not None: + logvar = self.log_min_variance + torch.abs(logvar) + + return (mean, logvar) \ No newline at end of file diff --git a/rlkit/torch/policy_gradient/gaussian_mlp_policy.py b/rlkit/torch/policy_gradient/gaussian_mlp_policy.py new file mode 100644 index 000000000..3fc89048f --- /dev/null +++ b/rlkit/torch/policy_gradient/gaussian_mlp_policy.py @@ -0,0 +1,55 @@ +import torch +from railrl.torch.core import PyTorchModule +import numpy as np +from railrl.policies.base import Policy +import railrl.torch.pytorch_util as ptu +import torch.distributions as tdist + +class GaussianMlpPolicy(Policy, PyTorchModule): + def 
__init__(self, + mean_mlp, + log_var_mlp, + min_variance=1e-6, + ): + super().__init__() + if min_variance is None: + self.log_min_variance = None + else: + #self.log_min_variance = float(np.log(min_variance)) # + self.log_min_variance = ptu.from_numpy(np.log(np.array([min_variance]))) + + self.mean_mlp = mean_mlp + self.log_var_mlp = log_var_mlp + + def rsample(self, mean, log_var): + return mean + torch.randn(mean.shape, device=ptu.device) * torch.exp(log_var) + + def log_prob(self, x, mean, log_var): + # Assumes x is batch size by feature dim + # Returns log_likelihood for each point in batch + zs = (x - mean) / torch.exp(log_var) + + dim = x.shape[0] if len(x.shape) == 1 else x.shape[-1] + return -torch.sum(log_var, -1) - \ + 0.5 * torch.sum(torch.pow(zs, 2), -1) -\ + 0.5 * dim * np.log(2*np.pi) + + def entropy(self, mean, log_var): + return torch.sum(log_var + np.log(np.sqrt(2 * np.pi * np.e)), -1) + + def get_action(self, observation): + action_params = self.forward(ptu.from_numpy(observation)) + action = self.rsample(*action_params) + + action_log_prob = self.log_prob(action, *action_params ) + agent_info = dict(action_log_prob=ptu.get_numpy(action_log_prob)) + return ptu.get_numpy(action), agent_info + + def forward(self, input): + #import ipdb; ipdb.set_trace() + mean = self.mean_mlp(input) + logvar = self.log_var_mlp(input) + if self.log_min_variance is not None: + logvar = torch.max(self.log_min_variance, logvar) + #logvar = self.log_min_variance + torch.abs(logvar) + return (mean, logvar) diff --git a/rlkit/torch/policy_gradient/gaussian_mlp_policy_fixed_var.py b/rlkit/torch/policy_gradient/gaussian_mlp_policy_fixed_var.py new file mode 100644 index 000000000..be267a2bc --- /dev/null +++ b/rlkit/torch/policy_gradient/gaussian_mlp_policy_fixed_var.py @@ -0,0 +1,77 @@ +import torch +import numpy as np +import pdb + +from railrl.torch.core import PyTorchModule +from railrl.policies.base import Policy +import railrl.torch.pytorch_util as ptu +import torch.distributions as tdist + +class GaussianMlpPolicyFixedVar(Policy, PyTorchModule): + def __init__(self, + mean_mlp, + log_var_mlp=0.2, + min_variance=1e-6, + ): + super().__init__() + if min_variance is None: + self.log_min_variance = None + else: + #self.log_min_variance = float(np.log(min_variance)) # + self.log_min_variance = ptu.from_numpy(np.log(np.array([min_variance]))) + + self.mean_mlp = mean_mlp + self.log_var_mlp = torch.tensor(log_var_mlp, requires_grad=False, device=ptu.device, dtype=torch.float32) + + def _rsample(self, mean, log_var): + return mean + torch.randn(mean.shape, device=ptu.device) * torch.exp(log_var) + + def log_prob(self, x, mean, log_var): + """Compute the log probability of an observation, parameterized by the mean and log(stddev**2) of a gaussian + + :param x: observation + :param mean: The mean of the gaussian. + :param log_var: The log(stddev**2) of the gaussian. + :returns: + :rtype: + + """ + # Assumes x is batch size by feature dim + # Returns log_likelihood for each point in batch + zs = (x - mean) / torch.exp(log_var) + + dim = x.shape[0] if len(x.shape) == 1 else x.shape[-1] + return -torch.sum(log_var, -1) - \ + 0.5 * torch.sum(torch.pow(zs, 2), -1) -\ + 0.5 * dim * np.log(2*np.pi) + + def entropy(self, mean, log_var): + return torch.sum(log_var + np.log(np.sqrt(2 * np.pi * np.e)), -1) + + def get_action(self, observation, **kwargs): + # (mean, log var) + action_params = self.forward(ptu.from_numpy_or_pytorch(observation)) + + # The argmax is the mean.
+ if 'argmax' in kwargs and kwargs['argmax']: + action = action_params[0] + else: + action = self._rsample(*action_params) + + action_log_prob = self.log_prob(action, *action_params) + agent_info = dict(action_log_prob=ptu.get_numpy(action_log_prob)) + return ptu.get_numpy(action), agent_info + + def forward(self, input): + """Predicts mean and log(stddev**2) of a diagonal gaussian + + :param input: + :returns: + :rtype: + + """ + + #import ipdb; ipdb.set_trace() + mean = self.mean_mlp(input) + logvar = self.log_var_mlp + return (mean, logvar) diff --git a/rlkit/torch/policy_gradient/parameter.py b/rlkit/torch/policy_gradient/parameter.py new file mode 100644 index 000000000..ff8305913 --- /dev/null +++ b/rlkit/torch/policy_gradient/parameter.py @@ -0,0 +1,22 @@ +import numpy as np +import torch.nn as nn +import railrl.torch.pytorch_util as ptu +import torch + +class Parameter(nn.Module): + def __init__(self, input_dim, output_dim, init): + super(Parameter, self).__init__() + self.output_dim = output_dim + self.init = init + self.param_init = ptu.from_numpy(np.zeros((output_dim)) + init).float() + #TODO: fix this nn.Parameter(self.param_init) + self.params_var = nn.Parameter(self.param_init) + + def forward(self, x): + + if len(x.shape) == 1: + zeros = ptu.zeros(self.output_dim) + return zeros + self.params_var + else: + zeros = ptu.zeros((x.shape[0], self.output_dim)) + return zeros + self.params_var.view(1, -1) \ No newline at end of file diff --git a/rlkit/torch/policy_gradient/ppo_algorithm.py b/rlkit/torch/policy_gradient/ppo_algorithm.py new file mode 100644 index 000000000..9a2909ca3 --- /dev/null +++ b/rlkit/torch/policy_gradient/ppo_algorithm.py @@ -0,0 +1,46 @@ + + +from railrl.torch.torch_rl_algorithm import TorchBatchRLAlgorithm +from torch.utils.tensorboard import SummaryWriter +from railrl.core import logger + + +class PPOAlgorithm(TorchBatchRLAlgorithm): + + def __init__( + self, *args, + **kwargs + ): + + super().__init__(*args, **kwargs) + # Create a tensorboard writer to log things. 
+ self.writer = SummaryWriter(logger.get_snapshot_dir() + '/tensorboard') + + def _train(self): + + out = super()._train() + + if ((self.epoch % 5) == 0): + self.render_video_and_add_to_tensorboard("eval_video") + + return out + + def render_video_and_add_to_tensorboard(self, tag): + import numpy as np + import pdb +# log.debug("{}".format("render_video_and_add_to_tensorboard")) + + path = self.eval_data_collector.collect_new_paths( + self.max_path_length, + self.num_eval_steps_per_epoch, + discard_incomplete_paths=True, + ) +# pdb.set_trace() +# timer.stamp('evaluation sampling') + +# path = self.exploration_data_collectors.model_exploration_collector.collect_rendered_rollout(self.max_path_length) +# video = np.stack(path['rendering'], axis=0) +# pdb.set_trace() + video = np.array([ x['rendering'] for x in path[0]['env_infos']]) + video = np.moveaxis(video, -1, -3)[None] + self.writer.add_video(tag, video, self.epoch, fps=8) \ No newline at end of file diff --git a/rlkit/torch/policy_gradient/ppo_trainer.py b/rlkit/torch/policy_gradient/ppo_trainer.py new file mode 100644 index 000000000..00ebe9640 --- /dev/null +++ b/rlkit/torch/policy_gradient/ppo_trainer.py @@ -0,0 +1,192 @@ + +import logging +import os +import torch.optim as optim + +import railrl.torch.pytorch_util as ptu +import railrl.core.pylogging +from railrl.torch.torch_rl_algorithm import TorchTrainer +import torch + +log = logging.getLogger(os.path.basename(__name__)) + +class PPOTrainer(TorchTrainer): + """ + Proximal Policy Optimization + """ + + def __init__( + self, + policy, + policy_learning_rate, + value_f, + replay_buffer, + clip_param=0.2, + entropy_bonus=0, + env_info_observation_key='', + discount_factor=0.99, + et_factor=0.2, + allow_large_updates=False, + debug=False + ): + super().__init__() + self.policy = policy +# self.target_policy = self.policy.copy() + self.value_f = value_f + self.policy_learning_rate = policy_learning_rate + self.optimizer_v = optim.Adam( list(self.value_f.parameters()), + lr=self.policy_learning_rate* 10) + + self.optimizer_p = optim.Adam(list(self.policy.parameters()), + lr=self.policy_learning_rate) + self.eval_statistics = {} + self.need_to_update_eval_statistics = True + self.replay_buffer = replay_buffer + self.clip_param = clip_param + self.entropy_bonus = entropy_bonus + self.env_info_observation_key = env_info_observation_key + self.debug = debug + self.discount_factor = discount_factor + + self.et_factor = et_factor + self.allow_large_updates = allow_large_updates + self.initial_policy_state = self.policy.state_dict() + self.initial_optimizer_states = [self.optimizer_v.state_dict(), self.optimizer_p.state_dict()] + + def reset_optimizers(self): + self.optimizer_v.load_state_dict(self.initial_optimizer_states[0]) + self.optimizer_p.load_state_dict(self.initial_optimizer_states[1]) + + def reset_policy(self): + self.policy.load_state_dict(self.initial_policy_state) + + def end_epoch(self, epoch): + self.replay_buffer.empty_buffer() + + def save_policy_and_optimizer(self, output_directory, basename): + """Saves the policy and its optimizer to the output directory with the given basename""" + policy_fn = os.path.join(output_directory, basename + '.pt') + optim_fn = os.path.join(output_directory, basename + '_optim.pt') + policy_state = dict(policy=self.policy.state_dict(), value_f=self.value_f.state_dict()) + optim_state = dict(optimizer_v=self.optimizer_v.state_dict(), optimizer_p=self.optimizer_p.state_dict()) + torch.save(policy_state, policy_fn) + torch.save(optim_state, 
optim_fn) + + def load_policy_and_optimizer(self, policy_params_filename): + """Loads the policy params and the optimizer params, using the path to the serialized policy parameters + + :param policy_params_filename: filename of the policy params + :returns: + :rtype: + + """ + log.info("Loading policy and optimizer. Policy filename: {}".format(policy_params_filename)) + policy_params_filename = os.path.realpath(policy_params_filename) + assert(os.path.isfile(policy_params_filename)) + # /foo/bar/policy.pt -> '/foo/bar/policy', '.pt' + policy_name, policy_ext = os.path.splitext(policy_params_filename) + # '/foo/bar/' + directory = os.path.dirname(policy_name) + # 'policy' + basename = os.path.basename(policy_name) + policy_fn = os.path.join(directory, basename + policy_ext) + # Sanity check. + assert(policy_fn == policy_params_filename) + optim_fn = os.path.join(directory, basename + '_optim' + policy_ext) + if not os.path.isfile(optim_fn): + raise FileNotFoundError("Policy filename {filename} not found".format(filename=optim_fn)) + + policy_state = torch.load(policy_fn) + optim_state = torch.load(optim_fn) + self.policy.load_state_dict(policy_state['policy']) + self.value_f.load_state_dict(policy_state['value_f']) + self.optimizer_v.load_state_dict(optim_state['optimizer_v']) + self.optimizer_p.load_state_dict(optim_state['optimizer_p']) + + def train_from_torch(self, batch): + # Batch is whole replay buffer in this case + if self.env_info_observation_key != '': + obs = batch[self.env_info_observation_key] + log.trace("Using alternate obs: {}".format(self.env_info_observation_key)) + else: + obs = batch['observations'] + + actions = batch['actions'] + returns = batch['returns'] + old_action_log_probs = batch['action_log_prob'] + adv_targ = batch['advs'] + adv_targ = (adv_targ - torch.mean(adv_targ)) / torch.std(adv_targ) + assert(torch.isfinite(adv_targ).all()) + +# pdb.set_trace() + """ + Policy operations. + """ + + action_dist_params = self.policy.forward(obs) + action_log_probs = self.policy.log_prob(actions, *action_dist_params).view(-1, 1) + + ratio = torch.exp(action_log_probs - old_action_log_probs) + surr1 = ratio * adv_targ + surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv_targ + # PPO's pessimistic surrogate (L^CLIP) + action_loss = -torch.min(surr1, surr2).mean() + + dist_entropy = self.policy.entropy(*action_dist_params).mean() +# total_loss = (value_loss + action_loss - dist_entropy * self.entropy_bonus) +# total_loss = (value_loss + action_loss) + total_loss = (action_loss) + + """ + Update Networks + """ + from torch.utils.data import Dataset, TensorDataset, DataLoader + dataset = TensorDataset(obs, returns) + train_loader = DataLoader(dataset=dataset, batch_size=64) + for obs_batch, return_batch in train_loader: + values = self.value_f.forward(obs_batch) + ### I like to scale this be the inverse discount factor + value_loss = 0.5*(((return_batch) - values) * ((1-self.discount_factor)/1.0)).pow(2).mean() + self.optimizer_v.zero_grad() + if self.debug: + # Slows things down. + with torch.autograd.detect_anomaly(): + value_loss.backward() + else: + value_loss.backward() + self.optimizer_v.step() + + r_ = 1 - torch.mean(ratio) + et_factor = 0.2 + small_update = (r_ < (et_factor)) and ( r_ > (-et_factor)) + + if small_update or self.allow_large_updates: ### update not to large ) + self.optimizer_p.zero_grad() + if self.debug: + # Slows things down. 
+ with torch.autograd.detect_anomaly(): + action_loss.backward() + else: + action_loss.backward() + self.optimizer_p.step() + else: + log.warning(f"Not taking large update with factor {r_:.2f}") + + self.eval_statistics['ppo_action_loss'] = ptu.get_numpy(action_loss) + self.eval_statistics['ppo_value_loss'] = ptu.get_numpy(value_loss) + self.eval_statistics['ppo_total_loss'] = ptu.get_numpy(total_loss) + self.eval_statistics['ppo_action_entropy'] = ptu.get_numpy(dist_entropy) + self.eval_statistics['ppo_importance_ratio'] = ptu.get_numpy(r_) + + def get_diagnostics(self): + return self.eval_statistics + + @property + def networks(self): + return [ + self.policy, + self.value_f + ] + + def get_snapshot(self): + return dict(policy=self.policy.state_dict(), baseline=self.value_f.state_dict()) diff --git a/rlkit/torch/policy_gradient/ppo_trainer_v2.py b/rlkit/torch/policy_gradient/ppo_trainer_v2.py new file mode 100644 index 000000000..2b3381d1c --- /dev/null +++ b/rlkit/torch/policy_gradient/ppo_trainer_v2.py @@ -0,0 +1,217 @@ + +import logging +import os +import torch.optim as optim +import torch.nn +import numpy as np + +import railrl.torch.pytorch_util as ptu +import railrl.core.pylogging +from railrl.torch.torch_rl_algorithm import TorchTrainer +import torch +import pdb +log = logging.getLogger(os.path.basename(__name__)) + +class PPOTrainerV2(TorchTrainer): + """ + Proximal Policy Optimization + """ + + def __init__( + self, + policy, + policy_learning_rate, + value_learning_rate, + value_f, + replay_buffer, + clip_param=0.2, + env_info_observation_key='', + vf_loss_coeff=1.0, + entropy_coeff=0.02, + vf_clip_param=0.1, + kl_coeff=0.2, + allow_large_updates=False, + debug=False, + reset_replay_buffer=True, + **kwargs + ): + super().__init__() + self.policy = policy + self.value_f = value_f + self.policy_learning_rate = policy_learning_rate + self.parameters_list = list(self.value_f.parameters()) + list(self.policy.parameters()) + self.policy_params = list(self.policy.parameters()) + self.value_params = list(self.value_f.parameters()) + self.value_lr = value_learning_rate + self.policy_lr = policy_learning_rate + self.optimizer_p = optim.Adam(self.policy_params, lr=self.policy_lr) + self.optimizer_v = optim.Adam(self.value_params, lr=self.value_lr) + self.eval_statistics = {} + self.need_to_update_eval_statistics = True + self.replay_buffer = replay_buffer + self.clip_param = clip_param + self.env_info_observation_key = env_info_observation_key + self.debug = debug + + self.vf_loss_coeff = vf_loss_coeff + self.entropy_coeff = entropy_coeff + self.vf_clip_param = vf_clip_param + self.kl_coeff = kl_coeff + self.grad_clip_norm = 1000 + + self.allow_large_updates = allow_large_updates + self.initial_policy_state = self.policy.state_dict() + self.initial_optimizer_state = self.optimizer_p.state_dict() + self.reset_replay_buffer = reset_replay_buffer + + def __repr__(self): + return (f"{self.__class__.__name__}(vf_loss_coeff={self.vf_loss_coeff:.3f}, " + + f"value_lr={self.value_lr:.2g}, " + + f"policy_lr={self.policy_lr:.2g}, " + + f"kl_coeff={self.kl_coeff:.3f}, " + + f"entropy_coeff={self.entropy_coeff:.3f}, " + + f"vf_clip_param={self.vf_clip_param:.3f})") + + def reset_optimizers(self): + self.optimizer.load_state_dict(self.initial_optimizer_state) + + def reset_policy(self): + self.policy.load_state_dict(self.initial_policy_state) + + def end_epoch(self, epoch): + if self.reset_replay_buffer: + log.info("Resetting policy replay buffer") + self.replay_buffer.empty_buffer() + else: + 
log.info("Not resetting policy replay buffer") + + def save_policy_and_optimizer(self, output_directory, basename): + """Saves the policy and its optimizer to the output directory with the given basename""" + policy_fn = os.path.join(output_directory, basename + '.pt') + optim_fn = os.path.join(output_directory, basename + '_optim.pt') + policy_state = dict(policy=self.policy.state_dict(), value_f=self.value_f.state_dict()) + optim_state = dict(optimizer=self.optimizer.state_dict()) + torch.save(policy_state, policy_fn) + torch.save(optim_state, optim_fn) + + def load_policy_and_optimizer(self, policy_params_filename): + """Loads the policy params and the optimizer params, using the path to the serialized policy parameters + + :param policy_params_filename: filename of the policy params + :returns: + :rtype: + + """ + log.info("Loading policy and optimizer. Policy filename: {}".format(policy_params_filename)) + policy_params_filename = os.path.realpath(policy_params_filename) + assert(os.path.isfile(policy_params_filename)) + # /foo/bar/policy.pt -> '/foo/bar/policy', '.pt' + policy_name, policy_ext = os.path.splitext(policy_params_filename) + # '/foo/bar/' + directory = os.path.dirname(policy_name) + # 'policy' + basename = os.path.basename(policy_name) + policy_fn = os.path.join(directory, basename + policy_ext) + # Sanity check. + assert(policy_fn == policy_params_filename) + optim_fn = os.path.join(directory, basename + '_optim' + policy_ext) + if not os.path.isfile(optim_fn): + raise FileNotFoundError("Policy filename {filename} not found".format(filename=optim_fn)) + + policy_state = torch.load(policy_fn) + optim_state = torch.load(optim_fn) + self.policy.load_state_dict(policy_state['policy']) + self.value_f.load_state_dict(policy_state['value_f']) + self.optimizer.load_state_dict(optim_state['optimizer']) + + def train_from_torch(self, batch): + # Batch is whole replay buffer in this case + if self.env_info_observation_key != '': + obs = batch[self.env_info_observation_key] + log.trace("Using alternate obs: {}".format(self.env_info_observation_key)) + next_obs = batch['next_' + self.env_info_observation_key] + else: + obs = batch['observations'] + next_obs = batch['next_observations'] + + actions = batch['actions'] + value_targets = batch['returns'] + previous_value_predictions = batch['vf_preds'] + old_action_log_probs = batch['action_log_prob'] + adv_targ = batch['advs'] + + assert(torch.isfinite(adv_targ).all()) + action_dist = self.policy.forward(obs) + action_log_probs = self.policy.log_prob(actions, *action_dist).view(-1, 1) + value_fn = self.value_f.forward(obs) + + # TOOD hardcoded discount + # TODO doesn't account for terminal states. + # td_target = batch['rewards'] + 0.95 * previous_value_predictions + # td1_error = torch.pow(td_target.detach() - value_fn, 2.) + + # Clamp extremely large values in log space before exponentiating (i.e. 
the ratio p/q has exploded) + ratio = torch.exp(torch.clamp(action_log_probs - old_action_log_probs, np.log(-1e8), np.log(1e5))) + surr1 = ratio * adv_targ + surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv_targ + ratio_mean = ptu.get_numpy(ratio).mean() + + # PPO's pessimistic surrogate (L^CLIP) + surrogate_loss = torch.min(surr1, surr2).mean() + + dist_entropy = self.policy.entropy(*action_dist).mean() + if not np.isclose(self.kl_coeff, 0.0): + old_action_dist = batch['action_dist'] + action_kl = torch.max(self.policy.kl(source_probs=old_action_dist, dest_probs=action_dist).mean(), + torch.zeros((), device=ptu.device)) + else: + action_kl = torch.zeros(()) + + vf_loss1 = torch.pow(value_fn - value_targets, 2.0) + # NB vf_clip_param should depend on reward scaling. + vf_clipped = previous_value_predictions + torch.clamp(value_fn - previous_value_predictions, -self.vf_clip_param, self.vf_clip_param) + vf_loss2 = torch.pow(vf_clipped - value_targets, 2.0) + vf_loss = torch.max(vf_loss1, vf_loss2).mean() + # vf_loss = torch.clamp(vf_loss1.mean(), -self.vf_clip_param, self.vf_clip_param) + + # vf_loss = td1_error.mean() + # vf_loss = torch.pow(value_fn - value_targets, 2.0).mean() + + total_loss = (-1 * surrogate_loss + + # self.kl_coeff * action_kl + + # self.vf_loss_coeff * vf_loss + + -1 * self.entropy_coeff * dist_entropy) + + assert(torch.isfinite(total_loss)) + + # with torch.autograd.detect_anomaly(): + self.optimizer_p.zero_grad() + total_loss.backward() + torch.nn.utils.clip_grad_norm_(self.policy_params, self.grad_clip_norm, norm_type=2) + self.optimizer_p.step() + + self.optimizer_v.zero_grad() + vf_loss.backward() + torch.nn.utils.clip_grad_norm_(self.value_params, self.grad_clip_norm, norm_type=2) + self.optimizer_v.step() + + self.eval_statistics['ppo_action_loss'] = ptu.get_numpy(surrogate_loss) + self.eval_statistics['ppo_value_loss'] = ptu.get_numpy(vf_loss) + # self.eval_statistics['ppo_deltas_loss'] = ptu.get_numpy(deltas) + self.eval_statistics['ppo_total_loss'] = ptu.get_numpy(total_loss) + self.eval_statistics['ppo_action_entropy'] = ptu.get_numpy(dist_entropy) + self.eval_statistics['ppo_action_kl'] = ptu.get_numpy(action_kl) + self.eval_statistics['ppo_importance_ratio'] = ratio_mean + + def get_diagnostics(self): + return self.eval_statistics + + @property + def networks(self): + return [ + self.policy, + self.value_f + ] + + def get_snapshot(self): + return dict(policy=self.policy.state_dict(), baseline=self.value_f.state_dict()) diff --git a/rlkit/torch/policy_gradient/reference_ppo.py b/rlkit/torch/policy_gradient/reference_ppo.py new file mode 100644 index 000000000..f88c947b0 --- /dev/null +++ b/rlkit/torch/policy_gradient/reference_ppo.py @@ -0,0 +1,112 @@ +import copy +from collections import OrderedDict + +import numpy as np +import torch +import torch.optim as optim +from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler +from irl.algos.batch_polopt import BatchPolopt +from irl.utils.torch_utils import from_numpy, Variable, get_numpy, LongTensor +import irl.utils.logger as logger + +class PPO(BatchPolopt): + def __init__( + self, + env, + env_name, + policy, + baseline, + ppo_batch_size=64, + epoch=10, + clip_param=0.2, + step_size=3e-4, + optimizer=None, + reset_args=None, + **kwargs): + if optimizer is None: + optimizer = optim.Adam(policy.get_params() + list(baseline.network.parameters()), lr=step_size) + self.ppo_batch_size = ppo_batch_size + self.optimizer = optimizer + self.epoch = epoch + self.clip_param = 
clip_param + self.reset_args = reset_args + + super(PPO, self).__init__(env=env, env_name=env_name, policy=policy, baseline=baseline, **kwargs) + self.fit_baseline = False # Don't do it during process samples. Will happen during optimize + + def obtain_samples(self, itr): + with torch.no_grad(): + if self.reset_args is not None: + return self.sampler.obtain_samples(self.batch_size, self.max_path_length, volatile=True, reset_args=self.reset_args) + return self.sampler.obtain_samples(self.batch_size, self.max_path_length, volatile=True) + + def optimize_policy(self, itr, samples_data, add_input_fn=None, add_input_input=None, add_loss_fn=None, print=True): + + advantages = from_numpy(samples_data['discount_adv'].astype(np.float32)) + n_traj = samples_data['obs'].shape[0] + n_obs = n_traj * self.max_path_length + #add_input_obs = from_numpy(samples_data['obs'][:, :, :self.obs_dim].astype(np.float32)).view(n_traj, -1) + if add_input_fn is not None: + obs = from_numpy(samples_data['obs'][:, :self.max_path_length, :self.obs_dim].astype(np.float32)).view(n_obs, -1) + else: + obs = from_numpy(samples_data['obs'][:, :self.max_path_length, :].astype(np.float32)).view(n_obs, -1) + + #obs = from_numpy(samples_data['obs'][:, :self.max_path_length, :].astype(np.float32)).view(n_obs, -1) + + actions = samples_data['actions'].view(n_obs, -1).data + returns = from_numpy(samples_data['discount_returns'].copy()).view(-1, 1).float() + old_action_log_probs = samples_data['log_prob'].view(n_obs, -1).data + states = samples_data['states'].view(samples_data['states'].size()[0], n_obs, -1) if self.policy.recurrent else None + + for epoch_itr in range(self.epoch): + sampler = BatchSampler(SubsetRandomSampler(range(n_obs)), + self.ppo_batch_size, drop_last=False) + for indices in sampler: + indices = LongTensor(indices) + obs_batch = Variable(obs[indices]) + actions_batch = actions[indices] + return_batch = returns[indices] + old_action_log_probs_batch = old_action_log_probs[indices] + if states is not None: + self.policy.set_state(Variable(states[:, indices])) + + if add_input_fn is not None: + add_input_dist = add_input_fn(Variable(add_input_input)) + add_input = add_input_dist.sample() + add_input_rep = torch.unsqueeze(add_input, 1).repeat(1, self.max_path_length, 1).view(n_obs, -1) + #add_input_batch = add_input[indices/add_input.size()[0]] + add_input_batch = add_input_rep[indices] + obs_batch = torch.cat([obs_batch, add_input_batch], -1) + + + values = self.baseline.forward(obs_batch.detach()) + action_dist = self.policy.forward(obs_batch) + action_log_probs = action_dist.log_likelihood(Variable(actions_batch)).unsqueeze(-1) + dist_entropy = action_dist.entropy().mean() + + ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) + adv_targ = Variable(advantages.view(-1, 1)[indices]) + surr1 = ratio * adv_targ + surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv_targ + action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) + + value_loss = (Variable(return_batch) - values).pow(2).mean() + + self.optimizer.zero_grad() + + total_loss = (value_loss + action_loss - dist_entropy * self.entropy_bonus) + if add_loss_fn is not None: + total_loss += add_loss_fn(add_input_dist, add_input, add_input_input) + total_loss.backward() + self.optimizer.step() + if print: + stats = {'total loss': get_numpy(total_loss)[0], + 'action loss': get_numpy(action_loss)[0], + 'value loss': get_numpy(value_loss)[0], + 'entropy' : get_numpy(dist_entropy)[0]} + with 
logger.prefix('Train PPO itr %d epoch itr %d | ' %(itr, epoch_itr)): + self.print_diagnostics(stats) + + + + return total_loss \ No newline at end of file diff --git a/rlkit/torch/policy_gradient/vpg_trainer.py b/rlkit/torch/policy_gradient/vpg_trainer.py new file mode 100644 index 000000000..26a0bcb4a --- /dev/null +++ b/rlkit/torch/policy_gradient/vpg_trainer.py @@ -0,0 +1,67 @@ +from collections import OrderedDict + +import numpy as np +import torch.optim as optim + +import railrl.torch.pytorch_util as ptu +from railrl.core import logger +from railrl.data_management.env_replay_buffer import VPGEnvReplayBuffer +from railrl.torch.core import np_to_pytorch_batch +from railrl.torch.distributions import TanhNormal +from railrl.torch.torch_rl_algorithm import TorchBatchRLAlgorithm, TorchTrainer +import torch + +class VPGTrainer(TorchTrainer): + """ + Vanilla Policy Gradient + """ + + def __init__( + self, + policy, + policy_learning_rate, + replay_buffer + ): + super().__init__() + self.policy = policy + self.policy_learning_rate = policy_learning_rate + self.policy_optimizer = optim.Adam(self.policy.parameters(), + lr=self.policy_learning_rate) + self.eval_statistics = {} + self.need_to_update_eval_statistics = True + self.replay_buffer = replay_buffer + + def end_epoch(self, epoch): + self.replay_buffer.empty_buffer() + + def train_from_torch(self, batch): + #batch = self.get_batch() + obs = batch['observations'] + actions = batch['actions'] + returns = batch['returns'] + """ + Policy operations. + """ + + _, means, _, _, _, stds,_, _ = self.policy.forward(obs,) + log_probs = TanhNormal(means, stds).log_prob(actions) + log_probs_times_returns = log_probs * returns + policy_loss = -1*torch.mean(log_probs_times_returns) + + + + """ + Update Networks + """ + + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + self.eval_statistics['Policy Loss'] = ptu.get_numpy(policy_loss) + + @property + def networks(self): + return [ + self.policy, + ] From 2a3aa257dd6f2085ea5e49b980cc3703bee8382d Mon Sep 17 00:00:00 2001 From: Neo-X Date: Thu, 18 Feb 2021 20:02:35 -0800 Subject: [PATCH 08/18] Working on buffer --- rlkit/data_management/env_replay_buffer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rlkit/data_management/env_replay_buffer.py b/rlkit/data_management/env_replay_buffer.py index fb615bb33..7eee5b415 100644 --- a/rlkit/data_management/env_replay_buffer.py +++ b/rlkit/data_management/env_replay_buffer.py @@ -31,7 +31,8 @@ def __init__( max_replay_buffer_size=max_replay_buffer_size, observation_dim=get_dim(self._ob_space), action_dim=get_dim(self._action_space), - env_info_sizes=env_info_sizes + env_info_sizes=env_info_sizes, + dtype=dtype ) def add_sample(self, observation, action, reward, terminal, From c2de8111c1543500cc8d5c4cda7ed9e449f00dbe Mon Sep 17 00:00:00 2001 From: Neo-X Date: Thu, 18 Feb 2021 20:05:24 -0800 Subject: [PATCH 09/18] Working on ppo --- rlkit/data_management/env_replay_buffer.py | 148 ++++++++++++++++++ rlkit/torch/core.py | 8 + .../policy_gradient/categorical_mlp_policy.py | 6 +- .../continuous_bernoulli_mlp_policy.py | 8 +- rlkit/torch/policy_gradient/gaussian_mlp.py | 2 +- .../policy_gradient/gaussian_mlp_policy.py | 6 +- .../gaussian_mlp_policy_fixed_var.py | 6 +- rlkit/torch/policy_gradient/parameter.py | 2 +- rlkit/torch/policy_gradient/ppo_algorithm.py | 4 +- rlkit/torch/policy_gradient/ppo_trainer.py | 6 +- rlkit/torch/policy_gradient/ppo_trainer_v2.py | 6 +- 
rlkit/torch/policy_gradient/vpg_trainer.py | 10 +- 12 files changed, 184 insertions(+), 28 deletions(-) diff --git a/rlkit/data_management/env_replay_buffer.py b/rlkit/data_management/env_replay_buffer.py index fb615bb33..69c1fdb1e 100644 --- a/rlkit/data_management/env_replay_buffer.py +++ b/rlkit/data_management/env_replay_buffer.py @@ -49,3 +49,151 @@ def add_sample(self, observation, action, reward, terminal, terminal=terminal, **kwargs ) + + +class PPOEnvReplayBuffer(EnvReplayBuffer): + def __init__( + self, + max_replay_buffer_size, + env, + discount_factor, + value_f, + use_gae=False, + gae_discount=0.95, + **kwargs + ): + super().__init__(max_replay_buffer_size, env, **kwargs) + self._returns = np.zeros((max_replay_buffer_size, 1)) + self.current_trajectory_rewards = np.zeros((max_replay_buffer_size, 1)) + self._max_replay_buffer_size = max_replay_buffer_size + self.discount_factor = discount_factor + self.value_f = value_f + self.use_gae = use_gae + self.gae_discount = gae_discount + self._bottom = 0 + self._values = np.zeros((max_replay_buffer_size, 1)) + self._advs = np.zeros((max_replay_buffer_size, 1)) + + def discounted_rewards(self, rewards, discount_factor): + import scipy + from scipy import signal, misc + """ + computes discounted sums along 0th dimension of x. + inputs + ------ + rewards: ndarray + discount_factor: float + outputs + ------- + y: ndarray with same shape as x, satisfying + y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], + where k = len(x) - t - 1 + """ + assert rewards.ndim >= 1 + return scipy.signal.lfilter([1],[1,-discount_factor],rewards[::-1], axis=0)[::-1] + + def terminate_episode(self): + returns = [] + observations = self._observations[self._bottom:self._top] + self._values[self._bottom:self._top] = ptu.get_numpy(self.value_f(ptu.from_numpy(observations))) + +# b1 = np.append(self._values[self._bottom:self._top], 0) + ### THe proper way to terminate the episode + b1 = np.append(self._values[self._bottom:self._top], 0 if self._terminals[self._top-1] else self._values[self._top-1]) +# b1 = np.append(self._values[self._bottom:self._top], self._values[self._top-1]) + b1 = np.reshape(b1, (-1,1)) + deltas = self._rewards[self._bottom:self._top] + self.discount_factor*b1[1:] - b1[:-1] + self._advs[self._bottom:self._top] = self.discounted_rewards(deltas, self.discount_factor * self.gae_discount) + self._returns[self._bottom:self._top] = self.discounted_rewards(self._rewards[self._bottom:self._top], self.discount_factor) + + self._bottom = self._top + + def add_sample(self, observation, action, reward, terminal, + next_observation, env_info=None, agent_info=None): + if self._top == self._max_replay_buffer_size: + raise EnvironmentError('Replay Buffer Overflow, please reduce the number of samples added!') + + # This could catch onehot vs. integer representation differences. + assert(self._actions.shape[-1] == action.size) + self._observations[self._top] = observation + self._actions[self._top] = action + self._rewards[self._top] = reward + self._terminals[self._top] = terminal + self._next_obs[self._top] = next_observation + + for key in self._env_info_keys: + self._env_infos[key][self._top] = env_info[key] + + for key in self._agent_info_keys: + self._agent_infos[key][self._top] = agent_info[key] + + self._advance() + + def add_paths(self, paths): + log.trace("Adding {} new paths. 
First path length: {}".format(len(paths), paths[0]['actions'].shape[0])) + for path in paths: + self.add_path(path) + # process samples after adding paths + self.process_samples(self.value_f) + + def process_samples(self, value_f): + # Compute value for all states + pass +# self._advs[:] = self._returns - self._values + + # Center adv +# advs = self._advs[:self._top] +# self._advs[:self._top] = (advs - advs.mean()) / (advs.std() + 1e-5) + + def random_batch(self, batch_size): + indices = np.random.randint(0, self._size, batch_size) + batch = dict( + observations=self._observations[indices], + actions=self._actions[indices], + rewards=self._rewards[indices], + terminals=self._terminals[indices], + next_observations=self._next_obs[indices], + returns=self._returns[indices], + advs=self._advs[indices], + vf_preds=self._values[indices] + ) + for key in self._env_info_keys: + assert key not in batch.keys() + batch[key] = self._env_infos[key][indices] + for key in self._agent_info_keys: + assert key not in batch.keys() + batch[key] = self._agent_infos[key][indices] + return batch + + def all_batch_windows(self, window_len, skip=1, return_env_info=False): + # Will return (bs, batch_len, dim) + + start_indices = np.arange(0, self._size - window_len, skip) + terminal_sums = [self._terminals[i0:i0+window_len].sum() for i0 in start_indices] + + # NB first mask should always be True for current start_indices. + valid_start_mask = np.logical_and(start_indices + window_len < self._size, np.equal(terminal_sums, 0)) + valid_start_indices = start_indices[valid_start_mask] + + batch = dict( + observations=np.stack([self._observations[i:i+window_len] for i in valid_start_indices]), + actions=np.stack([self._actions[i:i+window_len] for i in valid_start_indices]), + rewards=np.stack([self._rewards[i:i+window_len] for i in valid_start_indices]), + terminals=np.stack([self._terminals[i:i+window_len] for i in valid_start_indices]), + buffer_idx=valid_start_indices, + ) + if return_env_info: + env_info_batch = {} + for k, v in self._env_infos.items(): + env_info_batch[k] = np.stack([v[i:i+window_len] for i in valid_start_indices]) + batch.update(env_info_batch) + return batch + + def relabel_rewards(self, rewards): + # Ensure the updated rewards match the size of all of the data currently in the buffer. + assert(rewards.shape == (self._size,)) + # Assumes the rewards correspond to the most-recently-added data to the buffer. + # I'm pretty sure this assumption is valid, because otherwise self._size would have to + # be larger than rewards.shape[0]. + self._rewards[self._top - self._size:self._size] = rewards[:, None] + diff --git a/rlkit/torch/core.py b/rlkit/torch/core.py index a94a45578..ab44b9d32 100644 --- a/rlkit/torch/core.py +++ b/rlkit/torch/core.py @@ -1,8 +1,16 @@ +import abc import numpy as np import torch +from torch import nn as nn from rlkit.torch import pytorch_util as ptu +class PyTorchModule(nn.Module, metaclass=abc.ABCMeta): + """ + Keeping wrapper around to be a bit more future-proof. 
+ """ + pass + def eval_np(module, *args, **kwargs): """ diff --git a/rlkit/torch/policy_gradient/categorical_mlp_policy.py b/rlkit/torch/policy_gradient/categorical_mlp_policy.py index f666db06d..9aba8c8f8 100644 --- a/rlkit/torch/policy_gradient/categorical_mlp_policy.py +++ b/rlkit/torch/policy_gradient/categorical_mlp_policy.py @@ -2,9 +2,9 @@ import numpy as np import pdb -from railrl.torch.core import PyTorchModule -from railrl.policies.base import Policy -import railrl.torch.pytorch_util as ptu +from rlkit.torch.core import PyTorchModule +from rlkit.policies.base import Policy +import rlkit.torch.pytorch_util as ptu import torch.distributions as tdist class CategoricalMlpPolicy(Policy, PyTorchModule): diff --git a/rlkit/torch/policy_gradient/continuous_bernoulli_mlp_policy.py b/rlkit/torch/policy_gradient/continuous_bernoulli_mlp_policy.py index 2656e2c74..df5c55ee3 100644 --- a/rlkit/torch/policy_gradient/continuous_bernoulli_mlp_policy.py +++ b/rlkit/torch/policy_gradient/continuous_bernoulli_mlp_policy.py @@ -3,10 +3,10 @@ import torch import pdb -import railrl.torch.core as core -import railrl.policies.base as base -import railrl.misc.class_util as classu -import railrl.torch.pytorch_util as ptu +import rlkit.torch.core as core +import rlkit.policies.base as base +import rlkit.misc.class_util as classu +import rlkit.torch.pytorch_util as ptu class ContinuousBernoulliMlpPolicy(base.Policy, core.PyTorchModule): def __init__(self, mlp, action_space=None, scale=None, shift=None): diff --git a/rlkit/torch/policy_gradient/gaussian_mlp.py b/rlkit/torch/policy_gradient/gaussian_mlp.py index ffde4dce7..df06771c3 100644 --- a/rlkit/torch/policy_gradient/gaussian_mlp.py +++ b/rlkit/torch/policy_gradient/gaussian_mlp.py @@ -1,5 +1,5 @@ import torch -from railrl.torch.core import PyTorchModule +from rlkit.torch.core import PyTorchModule import numpy as np class GaussianMlp(PyTorchModule): diff --git a/rlkit/torch/policy_gradient/gaussian_mlp_policy.py b/rlkit/torch/policy_gradient/gaussian_mlp_policy.py index 3fc89048f..3f66431d1 100644 --- a/rlkit/torch/policy_gradient/gaussian_mlp_policy.py +++ b/rlkit/torch/policy_gradient/gaussian_mlp_policy.py @@ -1,8 +1,8 @@ import torch -from railrl.torch.core import PyTorchModule +from rlkit.torch.core import PyTorchModule import numpy as np -from railrl.policies.base import Policy -import railrl.torch.pytorch_util as ptu +from rlkit.policies.base import Policy +import rlkit.torch.pytorch_util as ptu import torch.distributions as tdist class GaussianMlpPolicy(Policy, PyTorchModule): diff --git a/rlkit/torch/policy_gradient/gaussian_mlp_policy_fixed_var.py b/rlkit/torch/policy_gradient/gaussian_mlp_policy_fixed_var.py index be267a2bc..635fd503e 100644 --- a/rlkit/torch/policy_gradient/gaussian_mlp_policy_fixed_var.py +++ b/rlkit/torch/policy_gradient/gaussian_mlp_policy_fixed_var.py @@ -2,9 +2,9 @@ import numpy as np import pdb -from railrl.torch.core import PyTorchModule -from railrl.policies.base import Policy -import railrl.torch.pytorch_util as ptu +from rlkit.torch.core import PyTorchModule +from rlkit.policies.base import Policy +import rlkit.torch.pytorch_util as ptu import torch.distributions as tdist class GaussianMlpPolicyFixedVar(Policy, PyTorchModule): diff --git a/rlkit/torch/policy_gradient/parameter.py b/rlkit/torch/policy_gradient/parameter.py index ff8305913..015128780 100644 --- a/rlkit/torch/policy_gradient/parameter.py +++ b/rlkit/torch/policy_gradient/parameter.py @@ -1,6 +1,6 @@ import numpy as np import torch.nn as nn -import 
railrl.torch.pytorch_util as ptu +import rlkit.torch.pytorch_util as ptu import torch class Parameter(nn.Module): diff --git a/rlkit/torch/policy_gradient/ppo_algorithm.py b/rlkit/torch/policy_gradient/ppo_algorithm.py index 9a2909ca3..0eaad9f02 100644 --- a/rlkit/torch/policy_gradient/ppo_algorithm.py +++ b/rlkit/torch/policy_gradient/ppo_algorithm.py @@ -1,8 +1,8 @@ -from railrl.torch.torch_rl_algorithm import TorchBatchRLAlgorithm +from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm from torch.utils.tensorboard import SummaryWriter -from railrl.core import logger +from rlkit.core import logger class PPOAlgorithm(TorchBatchRLAlgorithm): diff --git a/rlkit/torch/policy_gradient/ppo_trainer.py b/rlkit/torch/policy_gradient/ppo_trainer.py index 00ebe9640..e3c0a8d7a 100644 --- a/rlkit/torch/policy_gradient/ppo_trainer.py +++ b/rlkit/torch/policy_gradient/ppo_trainer.py @@ -3,9 +3,9 @@ import os import torch.optim as optim -import railrl.torch.pytorch_util as ptu -import railrl.core.pylogging -from railrl.torch.torch_rl_algorithm import TorchTrainer +import rlkit.torch.pytorch_util as ptu +import rlkit.core.pylogging +from rlkit.torch.torch_rl_algorithm import TorchTrainer import torch log = logging.getLogger(os.path.basename(__name__)) diff --git a/rlkit/torch/policy_gradient/ppo_trainer_v2.py b/rlkit/torch/policy_gradient/ppo_trainer_v2.py index 2b3381d1c..c1931b0dc 100644 --- a/rlkit/torch/policy_gradient/ppo_trainer_v2.py +++ b/rlkit/torch/policy_gradient/ppo_trainer_v2.py @@ -5,9 +5,9 @@ import torch.nn import numpy as np -import railrl.torch.pytorch_util as ptu -import railrl.core.pylogging -from railrl.torch.torch_rl_algorithm import TorchTrainer +import rlkit.torch.pytorch_util as ptu +import rlkit.core.pylogging +from rlkit.torch.torch_rl_algorithm import TorchTrainer import torch import pdb log = logging.getLogger(os.path.basename(__name__)) diff --git a/rlkit/torch/policy_gradient/vpg_trainer.py b/rlkit/torch/policy_gradient/vpg_trainer.py index 26a0bcb4a..b676fc817 100644 --- a/rlkit/torch/policy_gradient/vpg_trainer.py +++ b/rlkit/torch/policy_gradient/vpg_trainer.py @@ -4,11 +4,11 @@ import torch.optim as optim import railrl.torch.pytorch_util as ptu -from railrl.core import logger -from railrl.data_management.env_replay_buffer import VPGEnvReplayBuffer -from railrl.torch.core import np_to_pytorch_batch -from railrl.torch.distributions import TanhNormal -from railrl.torch.torch_rl_algorithm import TorchBatchRLAlgorithm, TorchTrainer +from rlkit.core import logger +from rlkit.data_management.env_replay_buffer import VPGEnvReplayBuffer +from rlkit.torch.core import np_to_pytorch_batch +from rlkit.torch.distributions import TanhNormal +from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm, TorchTrainer import torch class VPGTrainer(TorchTrainer): From 44ed6c2961021df111181fe6a39e55338d94e7ca Mon Sep 17 00:00:00 2001 From: Neo-X Date: Wed, 7 Apr 2021 09:52:56 -0700 Subject: [PATCH 10/18] Updating setup build --- requirements.txt | 2 +- setup.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 439c6be62..7495682a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ awscli==1.11.179 boto3==1.4.8 ray==0.2.2 path.py==10.3.1 -http://download.pytorch.org/whl/cu90/torch-0.4.1-cp35-cp35m-linux_x86_64.whl +torch==0.4.1 joblib==0.9.4 opencv-python==3.4.0.12 torchvision==0.2.0 diff --git a/setup.py b/setup.py index d52ae1f1c..391bbbd9a 100644 --- a/setup.py +++ b/setup.py @@ -7,4 +7,5 @@ 
packages=find_packages(), license='MIT License', long_description=open('README.md').read(), + install_requires=open('requirements.txt').read(), ) \ No newline at end of file From b5af8a8062312d39da34dfc96db33031df553512 Mon Sep 17 00:00:00 2001 From: Neo-X Date: Wed, 7 Apr 2021 10:19:15 -0700 Subject: [PATCH 11/18] adding module --- rlkit/util/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 rlkit/util/__init__.py diff --git a/rlkit/util/__init__.py b/rlkit/util/__init__.py new file mode 100644 index 000000000..e69de29bb From b5215e37d9d0e4abf0435fa7094f49e7ecc46ed7 Mon Sep 17 00:00:00 2001 From: Glen Date: Sat, 22 May 2021 12:43:11 -0700 Subject: [PATCH 12/18] Update requirements.txt Removing unneeded libraries. --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7495682a0..0027fe55b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,9 +18,9 @@ cached-property==1.3.1 gym==0.10.5 gitpython==2.1.7 gtimer==1.0.0b5 -awscli==1.11.179 +# awscli==1.11.179 boto3==1.4.8 -ray==0.2.2 +# ray==0.2.2 path.py==10.3.1 torch==0.4.1 joblib==0.9.4 opencv-python==3.4.0.12 torchvision==0.2.0 sk-video==1.1.10 # git+https://github.com/vitchyr/multiworld.git moviepy -comet_ml +# comet_ml From f57bc2afd28f2a0055bcb45ed9f42ba5ffe81177 Mon Sep 17 00:00:00 2001 From: Glen Date: Sat, 22 May 2021 13:08:08 -0700 Subject: [PATCH 13/18] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0027fe55b..89ccb40d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ scipy>=0.18.0 sphinx sphinx_rtd_theme numpydoc -cloudpickle==0.5.2 +cloudpickle==1.3.0 cached-property==1.3.1 gym==0.10.5 gitpython==2.1.7 From 662833a5d2ac617978c75def80c7ca144a4163ec Mon Sep 17 00:00:00 2001 From: Glen Date: Sat, 22 May 2021 13:08:35 -0700 Subject: [PATCH 14/18] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 89ccb40d7..758af177a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ sphinx_rtd_theme numpydoc cloudpickle==1.3.0 cached-property==1.3.1 -gym==0.10.5 +gym==0.17.3 gitpython==2.1.7 gtimer==1.0.0b5 # awscli==1.11.179 From b95839a0163abd4656ced6827ca03e82836fe742 Mon Sep 17 00:00:00 2001 From: Neo-X Date: Sat, 22 May 2021 13:10:41 -0700 Subject: [PATCH 15/18] Adding packages --- rlkit/envs/__init__.py | 0 rlkit/envs/goal_generation/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 rlkit/envs/__init__.py create mode 100644 rlkit/envs/goal_generation/__init__.py diff --git a/rlkit/envs/__init__.py b/rlkit/envs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rlkit/envs/goal_generation/__init__.py b/rlkit/envs/goal_generation/__init__.py new file mode 100644 index 000000000..e69de29bb From 081925cfff96fbb9d14f415ea87424dedb91d50c Mon Sep 17 00:00:00 2001 From: Glen Date: Sat, 22 May 2021 20:15:42 -0700 Subject: [PATCH 16/18] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 758af177a..aafa63651 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,7 @@ boto3==1.4.8 path.py==10.3.1 torch==0.4.1 joblib==0.9.4 -opencv-python==3.4.0.12 +opencv-python==3.4.2.17 torchvision==0.2.0 sk-video==1.1.10 #
git+https://github.com/vitchyr/multiworld.git From 6aeee7756a8ca1d2a46f6641472a4d4a33979d16 Mon Sep 17 00:00:00 2001 From: Glen Date: Sat, 22 May 2021 20:25:37 -0700 Subject: [PATCH 17/18] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index aafa63651..758af177a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,7 @@ boto3==1.4.8 path.py==10.3.1 torch==0.4.1 joblib==0.9.4 -opencv-python==3.4.2.17 +opencv-python==3.4.0.12 torchvision==0.2.0 sk-video==1.1.10 # git+https://github.com/vitchyr/multiworld.git From 76aea717c1c9c7ef6b18902bb617d427c19a9928 Mon Sep 17 00:00:00 2001 From: KBoumghar <63490201+KBoumghar@users.noreply.github.com> Date: Tue, 28 Jun 2022 15:33:44 -0400 Subject: [PATCH 18/18] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 758af177a..4762a7610 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,9 +22,9 @@ gtimer==1.0.0b5 boto3==1.4.8 # ray==0.2.2 path.py==10.3.1 -torch==0.4.1 +torch==1.6.0 joblib==0.9.4 -opencv-python==3.4.0.12 +opencv-python==4.3.0.36 torchvision==0.2.0 sk-video==1.1.10 # git+https://github.com/vitchyr/multiworld.git
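The following is a minimal, illustrative sketch (not part of the patch series itself) of how the new PPO components introduced above, GaussianMlpPolicy, Parameter, PPOEnvReplayBuffer, and PPOTrainerV2, might be wired together once these patches are applied. The Mlp network class from rlkit.torch.networks, the Pendulum-v0 environment, the hidden sizes, and all learning rates and buffer sizes are assumptions made for illustration only.

# Illustrative sketch only. Assumed names: rlkit.torch.networks.Mlp, the
# Pendulum-v0 environment, and every hyperparameter value below.
import gym

from rlkit.torch.networks import Mlp  # assumed MLP helper from the surrounding rlkit codebase
from rlkit.data_management.env_replay_buffer import PPOEnvReplayBuffer
from rlkit.torch.policy_gradient.gaussian_mlp_policy import GaussianMlpPolicy
from rlkit.torch.policy_gradient.parameter import Parameter
from rlkit.torch.policy_gradient.ppo_trainer_v2 import PPOTrainerV2

env = gym.make('Pendulum-v0')  # any continuous-action env would do
obs_dim = env.observation_space.low.size
action_dim = env.action_space.low.size

# State-dependent mean, state-independent log-variance (the Parameter module added above).
mean_mlp = Mlp(hidden_sizes=[64, 64], output_size=action_dim, input_size=obs_dim)
log_var_mlp = Parameter(input_dim=obs_dim, output_dim=action_dim, init=-1.0)
policy = GaussianMlpPolicy(mean_mlp, log_var_mlp)

# The value function is shared by the trainer and by the buffer's GAE/return computation.
value_f = Mlp(hidden_sizes=[64, 64], output_size=1, input_size=obs_dim)

replay_buffer = PPOEnvReplayBuffer(
    max_replay_buffer_size=10000,
    env=env,
    discount_factor=0.99,
    value_f=value_f,
    gae_discount=0.95,
)

trainer = PPOTrainerV2(
    policy=policy,
    policy_learning_rate=3e-4,
    value_learning_rate=1e-3,
    value_f=value_f,
    replay_buffer=replay_buffer,
    clip_param=0.2,
)

The policy, trainer, and replay buffer would then be handed to PPOAlgorithm (a TorchBatchRLAlgorithm) together with exploration and evaluation path collectors, following the usual rlkit batch-RL launch-script pattern.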