
Commit 9f8505f: merge w/ 2.0
xinpw8 committed Jan 5, 2025
2 parents 7098050 + c543e3d

Showing 174 changed files with 7,481 additions and 1,108 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/install.yml
@@ -0,0 +1,29 @@
name: install
on:
  push:
  pull_request:

jobs:
  test:
    name: test ${{ matrix.py }} - ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest
          - macos-latest
        py:
          - "3.11"
          - "3.10"
          - "3.9"
    steps:
      - name: Setup python for test ${{ matrix.py }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.py }}
      - uses: actions/checkout@v3
      - name: Upgrade pip
        run: python -m pip install -U pip
      - name: Install pufferlib
        run: pip3 install -e .
10 changes: 10 additions & 0 deletions .gitignore
@@ -1,3 +1,13 @@
# Annoying temp files generated by Cython
c_gae.c
pufferlib/extensions.c
pufferlib/ocean/grid/c_grid.c
pufferlib/ocean/tactical/c_tactical.c
pufferlib/puffernet.c

# Raylib
raylib_wasm/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
4 changes: 4 additions & 0 deletions MANIFEST.in
@@ -1,2 +1,6 @@
global-include *.pyx
global-include *.pxd
global-include *.h
global-include *.py
recursive-include pufferlib/resources *

33 changes: 5 additions & 28 deletions README.md
@@ -1,39 +1,16 @@
![figure](https://pufferai.github.io/source/resource/header.png)

[![PyPI version](https://badge.fury.io/py/pufferlib.svg)](https://badge.fury.io/py/pufferlib)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pufferlib)
![Github Actions](https://github.com/PufferAI/PufferLib/actions/workflows/install.yml/badge.svg)
[![](https://dcbadge.vercel.app/api/server/spT4huaGYV?style=plastic)](https://discord.gg/spT4huaGYV)
[![Twitter](https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow%20%40jsuarez5341)](https://twitter.com/jsuarez5341)

You have an environment, a PyTorch model, and a reinforcement learning framework that are designed to work together but don’t. PufferLib is a wrapper layer that makes RL on complex game environments as simple as RL on Atari. You write a native PyTorch network and a short binding for your environment; PufferLib takes care of the rest.
PufferLib is the reinforcement learning library I wish existed during my PhD. It started as a compatibility layer to make working with complex environments a breeze. Now, it's a high-performance toolkit for research and industry with optimized parallel simulation, environments that run and train at 1M+ steps/second, and tons of quality of life improvements for practitioners. All our tools are free and open source. We also offer priority service for companies, startups, and labs!

All of our [Documentation](https://pufferai.github.io "PufferLib Documentation") is hosted by github.io. @jsuarez5341 on [Discord](https://discord.gg/spT4huaGYV) for support -- post here before opening issues. I am also looking for contributors interested in adding bindings for other environments and RL frameworks.
![Trailer](https://github.com/PufferAI/puffer.ai/blob/main/docs/assets/puffer_2.gif?raw=true)

## Demo

The current `demo.py` is a souped-up version of CleanRL PPO with optimized LSTM support, detailed performance metrics, a local dashboard, async envpool sampling, checkpointing, wandb sweeps, and more. It has a powerful `--help` that generates options based on the specified environment and policy. Hyperparams are in `config.yaml`. A few examples:

```
# Train minigrid with multiprocessing. Save it as a baseline.
python demo.py --env minigrid --mode train --vec multiprocessing
```

![figure](https://raw.githubusercontent.com/PufferAI/pufferai.github.io/1.0/docs/source/resource/puffer-dash.png)

```
# Load the current minigrid baseline and render it locally
python demo.py --env minigrid --mode eval --baseline
# Train squared with serial vectorization and save it as a wandb baseline
# Then, load the current squared baseline and render it locally
python demo.py --env squared --mode train --baseline
python demo.py --env squared --mode eval --baseline
# Render NMMO locally with a random policy
python demo.py --env nmmo --mode eval
# Autotune vectorization settings for your machine
python demo.py --env breakout --mode autotune
```
All of our documentation is hosted at [puffer.ai](https://puffer.ai "PufferLib Documentation"). @jsuarez5341 on [Discord](https://discord.gg/puffer) for support -- post here before opening issues. We're always looking for new contributors, too!

## Star to puff up the project!

25 changes: 5 additions & 20 deletions clean_pufferl.py
@@ -126,21 +126,6 @@ def evaluate(data):
data.vecenv.send(actions)

with profile.eval_misc:
# Moves into models... maybe. Definitely moves.
# You could also just return infos and have it in demo
if 'pokemon_exploration_map' in infos:
for pmap in infos['pokemon_exploration_map']:
if not hasattr(data, 'pokemon_map'):
import pokemon_red_eval
data.map_updater = pokemon_red_eval.map_updater()
data.pokemon_map = pmap

data.pokemon_map = np.maximum(data.pokemon_map, pmap)

if len(infos['pokemon_exploration_map']) > 0:
rendered = data.map_updater(data.pokemon_map)
data.stats['Media/exploration_map'] = data.wandb.Image(rendered)

for k, v in infos.items():
if '_map' in k and data.wandb is not None:
data.stats[f'Media/{k}'] = data.wandb.Image(v[0])
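The replacement loop above drops the Pokémon-specific handling and instead logs any info key containing `_map` as a wandb image. A minimal sketch of the contract this assumes, with hypothetical key names and no real wandb dependency, is:

```python
# Sketch of the producer/consumer contract behind the generic '_map' logging
# loop above. The info key and array contents are hypothetical; only the
# "any '_map' key becomes a wandb.Image" behavior comes from the diff.
import numpy as np

def fake_env_infos():
    # An environment could accumulate an exploration heatmap and expose it
    # through the info dict returned from step().
    exploration = np.zeros((64, 64), dtype=np.uint8)
    exploration[10:20, 10:20] = 255  # pretend the agent explored this region
    return {'exploration_map': [exploration]}

def log_map_infos(infos, stats, wandb_module):
    # Mirrors the generic loop: every '_map' key is wrapped as an image.
    for k, v in infos.items():
        if '_map' in k and wandb_module is not None:
            stats[f'Media/{k}'] = wandb_module.Image(v[0])

stats = {}
log_map_infos(fake_env_infos(), stats, wandb_module=None)  # no-op without wandb
print(stats)  # {} here; with wandb installed it would hold a wandb.Image
```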
@@ -518,13 +503,13 @@ def __init__(self, delay=1, maxlen=20):

def run(self):
while not self.stopped:
self.cpu_util.append(psutil.cpu_percent())
self.cpu_util.append(100*psutil.cpu_percent())
mem = psutil.virtual_memory()
self.cpu_mem.append(mem.active / mem.total)
self.cpu_mem.append(100*mem.active/mem.total)
if torch.cuda.is_available():
self.gpu_util.append(torch.cuda.utilization())
free, total = torch.cuda.mem_get_info()
self.gpu_mem.append(free / total)
self.gpu_mem.append(100*free/total)
else:
self.gpu_util.append(0)
self.gpu_mem.append(0)
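The Utilization thread samples CPU, DRAM, and GPU usage from psutil and torch.cuda. A standalone sketch of the same library calls, outside PufferLib's class and with my own scaling choices, is:

```python
# Standalone sketch of the hardware sampling used by the Utilization thread
# above: psutil for CPU/DRAM, torch.cuda for GPU. Not PufferLib's class; just
# the same library calls in isolation, with scaling chosen for this sketch.
import psutil
import torch

def sample_utilization():
    cpu_util = psutil.cpu_percent()           # float percentage, 0-100
    mem = psutil.virtual_memory()
    dram_pct = 100 * mem.active / mem.total   # 'active' exists on Linux/macOS
    if torch.cuda.is_available():
        gpu_util = torch.cuda.utilization()   # 0-100; requires nvidia-ml-py
        free, total = torch.cuda.mem_get_info()
        gpu_mem_pct = 100 * free / total      # percentage of free GPU memory
    else:
        gpu_util, gpu_mem_pct = 0, 0
    return cpu_util, dram_pct, gpu_util, gpu_mem_pct

print(sample_utilization())
```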
Expand Down Expand Up @@ -598,7 +583,7 @@ def rollout(env_creator, env_kwargs, policy_cls, rnn_cls, agent_creator, agent_k

frames = []
tick = 0
while tick <= 1000:
while tick <= 2000:
if tick % 1 == 0:
render = driver.render()
if driver.render_mode == 'ansi':
Expand Down Expand Up @@ -703,7 +688,7 @@ def print_dashboard(env_name, utilization, global_step, epoch,
table.add_column(justify="center", width=13)
table.add_column(justify="right", width=13)
table.add_row(
f':blowfish: {c1}PufferLib {b2}1.0.0',
f':blowfish: {c1}PufferLib {b2}2.0.0',
f'{c1}CPU: {c3}{cpu_percent:.1f}%',
f'{c1}GPU: {c3}{gpu_percent:.1f}%',
f'{c1}DRAM: {c3}{dram_percent:.1f}%',
44 changes: 36 additions & 8 deletions config/ocean/go.ini
@@ -1,25 +1,53 @@
[base]
package = ocean
env_name = puffer_go
policy_name = Policy
policy_name = Go
rnn_name = Recurrent

[env]
num_envs = 512
num_envs = 2048
reward_move_pass = -0.47713279724121094
reward_move_valid = 0
reward_move_invalid = -0.47179355621337893
reward_opponent_capture = -0.5240603446960449
reward_player_capture = 0.22175729274749756
grid_size = 7

[train]
total_timesteps = 150_000_000
total_timesteps = 2_000_000_000
checkpoint_interval = 50
num_envs = 2
num_workers = 2
env_batch_size = 1
batch_size = 131072
update_epochs = 4
minibatch_size = 32768
batch_size = 524288
update_epochs = 1
minibatch_size = 131072
bptt_horizon = 16
learning_rate = 0.002
learning_rate = 0.0015
ent_coef = 0.013460194258584548
gae_lambda = 0.90
gamma = 0.95
max_grad_norm = 0.8140400052070618
vf_coef = 0.48416485817685223
anneal_lr = False
device = cuda
device = cpu

[sweep.parameters.env.parameters.reward_move_invalid]
distribution = uniform
min = -1.0
max = 0.0

[sweep.parameters.env.parameters.reward_move_pass]
distribution = uniform
min = -1.0
max = 0.0

[sweep.parameters.env.parameters.reward_player_capture]
distribution = uniform
min = 0.0
max = 1.0

[sweep.parameters.env.parameters.reward_opponent_capture]
distribution = uniform
min = -1.0
max = 0.0
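The new `[sweep.parameters.*]` sections use dotted section names to express a nested parameter tree (here, four env reward ranges). A hedged sketch of reading that nesting with the standard library, not PufferLib's actual config loader:

```python
# Sketch: turn dotted [sweep.parameters.env.parameters.reward_move_pass]-style
# sections into a nested dict. Generic configparser code, not PufferLib's
# loader; the file path is hypothetical.
import configparser

def nested_sweep(ini_path):
    cfg = configparser.ConfigParser()
    cfg.read(ini_path)
    tree = {}
    for section in cfg.sections():
        if not section.startswith('sweep.'):
            continue
        node = tree
        for part in section.split('.')[1:]:  # drop the leading 'sweep'
            node = node.setdefault(part, {})
        node.update(dict(cfg[section]))      # leaf keys: distribution/min/max
    return tree

# e.g. nested_sweep('config/ocean/go.ini')['parameters']['env']['parameters']
# would contain the four reward_* distributions declared above.
```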
34 changes: 15 additions & 19 deletions config/nmmo3.ini → config/ocean/nmmo3.ini
@@ -1,19 +1,20 @@
[base]
package = nmmo3
package = ocean
env_name = nmmo3
rnn_name = Recurrent
policy_name = NMMO3
rnn_name = NMMO3LSTM

[train]
total_timesteps = 10000000000
total_timesteps = 107000000000
checkpoint_interval = 1000
learning_rate = 0.000972332726277282
learning_rate = 0.0004573146765703167
num_envs = 2
num_workers = 2
env_batch_size = 1
update_epochs = 1
gamma = 0.8602833367538562
gae_lambda = 0.916381394950097
ent_coef = 0.004758679104391214
gamma = 0.7647543366891623
gae_lambda = 0.996005622445478
ent_coef = 0.01210084358004069
max_grad_norm = 0.6075578331947327
vf_coef = 0.3979089612467003
bptt_horizon = 16
@@ -23,12 +24,11 @@ compile = False
anneal_lr = False

[env]
reward_combat_level = 1.305025339126587
reward_prof_level = 1.1842153072357178
reward_item_level = 1.0236146450042725
reward_combat_level = 2.9437930583953857
reward_prof_level = 1.445250153541565
reward_item_level = 1.3669428825378418
reward_market = 0
#reward_market = 0.23154078423976895
reward_death_mmo = -1.033899426460266
reward_death = -2.46451187133789

[sweep.metric]
goal = maximize
@@ -49,17 +49,13 @@ distribution = uniform
min = 0.0
max = 5.0

[sweep.parameters.env.parameters.reward_market]
distribution = uniform
min = 0.0
max = 1.0

[sweep.parameters.env.parameters.reward_death_mmo]
distribution = uniform
min = -5.0
max = 0.0

[sweep.parameters.train.parameters.total_timesteps]
distribution = uniform
min = 300_000_000
max = 100_000_000_000
min = 1_000_000_000
max = 10_000_000_000

21 changes: 21 additions & 0 deletions config/ocean/pysquared.ini
@@ -0,0 +1,21 @@
[base]
package = ocean
env_name = puffer_pysquared
policy_name = Policy
rnn_name = Recurrent

[env]
num_envs = 1

[train]
total_timesteps = 40_000_000
checkpoint_interval = 50
num_envs = 12288
num_workers = 12
env_batch_size = 4096
batch_size = 131072
update_epochs = 1
minibatch_size = 8192
learning_rate = 0.0017
anneal_lr = False
device = cuda
64 changes: 64 additions & 0 deletions config/ocean/trash_pickup.ini
@@ -0,0 +1,64 @@
[base]
package = ocean
env_name = trash_pickup puffer_trash_pickup
policy_name = TrashPickup
rnn_name = Recurrent

[env]
num_envs = 1024 # Recommended starting value: 4096 / num_agents
grid_size = 10
num_agents = 4
num_trash = 20
num_bins = 1
max_steps = 150
report_interval = 32
agent_sight_range = 5 # only used with 2D local crop obs space

[train]
total_timesteps = 100_000_000
checkpoint_interval = 200
num_envs = 2
num_workers = 2
env_batch_size = 1
batch_size = 131072
update_epochs = 1
minibatch_size = 16384
bptt_horizon = 8
anneal_lr = False
device = cuda
learning_rate = 0.001
gamma = 0.95
gae_lambda = 0.85
vf_coef = 0.4
clip_coef = 0.1
vf_clip_coef = 0.1
ent_coef = 0.01

[sweep.metric]
goal = maximize
name = environment/episode_return

[sweep.parameters.train.parameters.learning_rate]
distribution = log_uniform_values
min = 0.000001
max = 0.01

[sweep.parameters.train.parameters.gamma]
distribution = uniform
min = 0
max = 1

[sweep.parameters.train.parameters.gae_lambda]
distribution = uniform
min = 0
max = 1

[sweep.parameters.train.parameters.update_epochs]
distribution = int_uniform
min = 1
max = 4

[sweep.parameters.train.parameters.ent_coef]
distribution = log_uniform_values
min = 1e-5
max = 1e-1
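The sweep section above declares three distribution types (uniform, int_uniform, log_uniform_values). A minimal sketch of drawing one hyperparameter set from such ranges, illustrative only and not the sweep backend PufferLib or W&B actually uses:

```python
# Sketch: sample one config from the distribution types declared in the
# [sweep.parameters.train.parameters.*] sections above. Not a real sweep engine.
import math
import random

def sample(spec):
    dist, lo, hi = spec['distribution'], float(spec['min']), float(spec['max'])
    if dist == 'uniform':
        return random.uniform(lo, hi)
    if dist == 'int_uniform':
        return random.randint(int(lo), int(hi))
    if dist == 'log_uniform_values':
        # uniform in log space between the two endpoint values
        return math.exp(random.uniform(math.log(lo), math.log(hi)))
    raise ValueError(f'unknown distribution {dist!r}')

space = {
    'learning_rate': {'distribution': 'log_uniform_values', 'min': 1e-6, 'max': 1e-2},
    'gamma':         {'distribution': 'uniform', 'min': 0, 'max': 1},
    'update_epochs': {'distribution': 'int_uniform', 'min': 1, 'max': 4},
    'ent_coef':      {'distribution': 'log_uniform_values', 'min': 1e-5, 'max': 1e-1},
}
print({k: sample(v) for k, v in space.items()})
```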
2 changes: 1 addition & 1 deletion demo.py
@@ -430,7 +430,7 @@ def train(args, make_env, policy_cls, rnn_cls, wandb,
rnn_name = args['base']['rnn_name']
rnn_cls = None
if rnn_name is not None:
rnn_cls = getattr(env_module, args['base']['rnn_name'])
rnn_cls = getattr(env_module.torch, args['base']['rnn_name'])

if args['baseline']:
assert args['mode'] in ('train', 'eval', 'evaluate')
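The one-line demo.py change resolves the RNN class on `env_module.torch` instead of `env_module` itself. A small sketch of that getattr-based lookup pattern, with hypothetical module and class names:

```python
# Sketch of the getattr-based lookup demo.py now performs: the RNN class name
# comes from the [base] section of the .ini config and is resolved against the
# environment package's `torch` submodule. Names are hypothetical stand-ins,
# not a guaranteed PufferLib layout.
def resolve_rnn(env_module, rnn_name):
    if rnn_name is None:
        return None
    # Before the change this was getattr(env_module, rnn_name); now the class
    # is expected on env_module.torch (the package's torch.py module, which
    # must already be imported for the attribute to exist).
    return getattr(env_module.torch, rnn_name)

# Hypothetical usage:
#   import importlib
#   env_module = importlib.import_module('pufferlib.ocean')
#   rnn_cls = resolve_rnn(env_module, 'Recurrent')
```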
