diff --git a/.dockerignore b/.dockerignore
index 9c731133..c1591543 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -143,4 +143,5 @@
 cython_debug/
 **/.DS_Store
 **/log
-**/*.qdrep
\ No newline at end of file
+**/*.qdrep
+!bmtrain/dist
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 47a9427f..a9c0b301 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,8 +12,9 @@
 RUN apt install iputils-ping opensm libopensm-dev libibverbs1 libibverbs-dev -y
 ENV TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5
 ENV BMP_AVX512=1
 ADD other_requirements.txt other_requirements.txt
-RUN pip3 install -r other_requirements.txt
-RUN pip3 install bmtrain
+RUN pip3 install --upgrade pip && pip3 install -r other_requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+ADD . .
+RUN python3 setup.py install
 WORKDIR /root
 ADD example example
\ No newline at end of file
diff --git a/bmtrain/__init__.py b/bmtrain/__init__.py
index efbf1fc5..3e025846 100644
--- a/bmtrain/__init__.py
+++ b/bmtrain/__init__.py
@@ -19,4 +19,4 @@
 from . import inspect
 from . import lr_scheduler
 from . import loss
-from . import dist
+from . import distributed
diff --git a/bmtrain/dist/__init__.py b/bmtrain/distributed/__init__.py
similarity index 100%
rename from bmtrain/dist/__init__.py
rename to bmtrain/distributed/__init__.py
diff --git a/bmtrain/dist/ops.py b/bmtrain/distributed/ops.py
similarity index 90%
rename from bmtrain/dist/ops.py
rename to bmtrain/distributed/ops.py
index 2cd69bdf..7a37d158 100644
--- a/bmtrain/dist/ops.py
+++ b/bmtrain/distributed/ops.py
@@ -1,4 +1,3 @@
-from typing import Literal
 import torch
 from ..global_var import config
 from ..nccl import allGather as ncclAllGather
@@ -30,7 +29,7 @@
 
 class OpAllReduce(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, input : torch.Tensor, op: Literal['sum', 'prod', 'max', 'min', 'avg']):
+    def forward(ctx, input : torch.Tensor, op : str):
         if not input.contiguous():
             input = input.contiguous()
         output = torch.empty( input.size(), dtype=input.dtype, device=input.device)
@@ -64,7 +63,7 @@ def backward(ctx, grad_output):
         else:
             return grad_output * ctx.saved_tensors[0], None
 
 
-def all_reduce(x : torch.Tensor, op: Literal['sum', 'prod', 'max', 'min', 'avg']):
+def all_reduce(x : torch.Tensor, op : str = "sum"):
     assert x.is_cuda
     return OpAllReduce.apply(x, op)
diff --git a/other_requirements.txt b/other_requirements.txt
index 359486b4..6654b1ac 100644
--- a/other_requirements.txt
+++ b/other_requirements.txt
@@ -1,3 +1,6 @@
 tqdm
 cpm_kernels>=1.0.11
-jieba
\ No newline at end of file
+jieba
+tensorboard
+setuptools_rust
+transformers
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 657e8664..283d4b73 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,6 @@ def get_avx_flags():
     install_requires=[
         "torch>=1.10",
         "numpy",
-        "tensorboard"
     ],
     ext_modules=[
         CUDAExtension('bmtrain.nccl._C', [
diff --git a/tests/test_dist.py b/tests/test_dist.py
index a14f5481..c6844ee3 100644
--- a/tests/test_dist.py
+++ b/tests/test_dist.py
@@ -4,7 +4,7 @@
 def main():
     bmt.init_distributed()
     x = torch.full((1,), bmt.rank() + 1, dtype=torch.half, device="cuda").requires_grad_(True)
-    y = bmt.dist.all_reduce(x, "prod").view(-1)
+    y = bmt.distributed.all_reduce(x, "prod").view(-1)
     bmt.print_rank(y)
     loss = (y * y).sum() / 2
     loss.backward()
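
For reference, a minimal usage sketch of the renamed module, adapted from tests/test_dist.py above. It assumes bmtrain is built from this tree and the script runs under a multi-process launcher; the torchrun invocation mentioned in the comments is an assumption, not part of this diff.

# Usage sketch for the renamed bmtrain.distributed module (was bmtrain.dist).
# Assumes one CUDA device per rank and a multi-process launcher, e.g.:
#   torchrun --nproc_per_node=2 this_script.py
# (the launcher choice is an assumption, not part of this diff)
import torch
import bmtrain as bmt

bmt.init_distributed()

# Each rank contributes its own value: rank 0 -> 1.0, rank 1 -> 2.0, ...
x = torch.full((1,), bmt.rank() + 1, dtype=torch.half, device="cuda")

# `op` now defaults to "sum"; the removed Literal annotation still documents
# the accepted values: "sum", "prod", "max", "min", "avg".
s = bmt.distributed.all_reduce(x)          # sum over all ranks
p = bmt.distributed.all_reduce(x, "prod")  # product over all ranks

bmt.print_rank(s, p)  # prints only on rank 0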