diff --git a/.dockerignore b/.dockerignore
index 9c731133..c1591543 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -143,4 +143,5 @@
 cython_debug/
 **/.DS_Store
 **/log
-**/*.qdrep
\ No newline at end of file
+**/*.qdrep
+!bmtrain/dist
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 47a9427f..a9c0b301 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,8 +12,9 @@
 RUN apt install iputils-ping opensm libopensm-dev libibverbs1 libibverbs-dev -y
 ENV TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5
 ENV BMP_AVX512=1
 ADD other_requirements.txt other_requirements.txt
-RUN pip3 install -r other_requirements.txt
-RUN pip3 install bmtrain
+RUN pip3 install --upgrade pip && pip3 install -r other_requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+ADD . .
+RUN python3 setup.py install
 WORKDIR /root
 ADD example example
\ No newline at end of file
diff --git a/bmtrain/__init__.py b/bmtrain/__init__.py
index efbf1fc5..3e025846 100644
--- a/bmtrain/__init__.py
+++ b/bmtrain/__init__.py
@@ -19,4 +19,4 @@
 from . import inspect
 from . import lr_scheduler
 from . import loss
-from . import dist
+from . import distributed
diff --git a/bmtrain/dist/__init__.py b/bmtrain/distributed/__init__.py
similarity index 100%
rename from bmtrain/dist/__init__.py
rename to bmtrain/distributed/__init__.py
diff --git a/bmtrain/dist/ops.py b/bmtrain/distributed/ops.py
similarity index 90%
rename from bmtrain/dist/ops.py
rename to bmtrain/distributed/ops.py
index 2cd69bdf..7a37d158 100644
--- a/bmtrain/dist/ops.py
+++ b/bmtrain/distributed/ops.py
@@ -1,4 +1,3 @@
-from typing import Literal
 import torch
 from ..global_var import config
 from ..nccl import allGather as ncclAllGather
@@ -30,7 +29,7 @@
 
 class OpAllReduce(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, input : torch.Tensor, op: Literal['sum', 'prod', 'max', 'min', 'avg']):
+    def forward(ctx, input : torch.Tensor, op : str):
         if not input.contiguous():
             input = input.contiguous()
         output = torch.empty( input.size(), dtype=input.dtype, device=input.device)
@@ -64,7 +63,7 @@ def backward(ctx, grad_output):
         else:
             return grad_output * ctx.saved_tensors[0], None
 
 
-def all_reduce(x : torch.Tensor, op: Literal['sum', 'prod', 'max', 'min', 'avg']):
+def all_reduce(x : torch.Tensor, op : str = "sum"):
     assert x.is_cuda
     return OpAllReduce.apply(x, op)
diff --git a/other_requirements.txt b/other_requirements.txt
index 359486b4..6654b1ac 100644
--- a/other_requirements.txt
+++ b/other_requirements.txt
@@ -1,3 +1,6 @@
 tqdm
 cpm_kernels>=1.0.11
-jieba
\ No newline at end of file
+jieba
+tensorboard
+setuptools_rust
+transformers
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 657e8664..283d4b73 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,6 @@ def get_avx_flags():
     install_requires=[
         "torch>=1.10",
         "numpy",
-        "tensorboard"
     ],
     ext_modules=[
         CUDAExtension('bmtrain.nccl._C', [
diff --git a/tests/test_dist.py b/tests/test_dist.py
index a14f5481..c6844ee3 100644
--- a/tests/test_dist.py
+++ b/tests/test_dist.py
@@ -4,7 +4,7 @@
 def main():
     bmt.init_distributed()
     x = torch.full((1,), bmt.rank() + 1, dtype=torch.half, device="cuda").requires_grad_(True)
-    y = bmt.dist.all_reduce(x, "prod").view(-1)
+    y = bmt.distributed.all_reduce(x, "prod").view(-1)
     bmt.print_rank(y)
     loss = (y * y).sum() / 2
     loss.backward()
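
For reference, a minimal usage sketch of the renamed module, adapted from tests/test_dist.py above. It assumes bmtrain is built from this tree and the script runs under a multi-process launcher; the torchrun invocation mentioned in the comments is an assumption, not part of this diff.

# Usage sketch for the renamed bmtrain.distributed module (was bmtrain.dist).
# Assumes one CUDA device per rank and a multi-process launcher, e.g.:
#   torchrun --nproc_per_node=2 this_script.py
# (the launcher choice is an assumption, not part of this diff)
import torch
import bmtrain as bmt

bmt.init_distributed()

# Each rank contributes its own value: rank 0 -> 1.0, rank 1 -> 2.0, ...
x = torch.full((1,), bmt.rank() + 1, dtype=torch.half, device="cuda")

# `op` now defaults to "sum"; the removed Literal annotation still documents
# the accepted values: "sum", "prod", "max", "min", "avg".
s = bmt.distributed.all_reduce(x)          # sum over all ranks
p = bmt.distributed.all_reduce(x, "prod")  # product over all ranks

bmt.print_rank(s, p)  # prints only on rank 0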