Improve throughput of computing embeddings with BetterTransformer #15

Open · wants to merge 6 commits into base: main
3 changes: 1 addition & 2 deletions .github/workflows/base.yml
@@ -28,5 +28,4 @@ jobs:
          pip install -e '.[dev]'
      - name: Run tests
        run: |
-         pytest --cov="crossfit"
-
+         pytest -m "not (singlegpu or multigpu)" --cov="crossfit"
4 changes: 2 additions & 2 deletions .github/workflows/cf_backends.yml
@@ -37,7 +37,7 @@ jobs:
          python setup.py develop
      - name: Run Pytorch tests
        run: |
-         pytest --cov="crossfit/array" -m "pytorch"
+         pytest --cov="crossfit/array" -m "pytorch and not (singlegpu or multigpu)"

  # jax:
  #   runs-on: ${{ matrix.os }}
@@ -66,4 +66,4 @@ jobs:
  #       python setup.py develop
  #     - name: Run Jax tests
  #       run: |
- #       pytest --cov="crossfit/array" -m "jax"
+ #       pytest --cov="crossfit/array" -m "jax"
31 changes: 28 additions & 3 deletions .github/workflows/gpu-ci.yml
@@ -25,8 +25,33 @@ jobs:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
+     - name: Install dependencies
+       run: |
+         python -m pip install "torch${{ matrix.torch-version }}"
+         python -m pip install .[pytorch-dev]
      - name: Run tests
        run: |
-         python -m pip install -r requirements/base.txt
-         python -m pip install -r requirements/pytorch.txt
-         pytest tests/
+         pytest -m singlegpu tests/
+
+#multi-gpu-ci:
+#  runs-on: linux-amd64-gpu-p100-latest-2
+#  container:
+#    image: nvcr.io/nvidia/pytorch:23.09-py3
+#    env:
+#      NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+#    options: --shm-size=1G
+#    credentials:
+#      username: $oauthtoken
+#      password: ${{ secrets.NGC_TOKEN }}
+#
+#  steps:
+#    - uses: actions/checkout@v3
+#      with:
+#        fetch-depth: 0
+#    - name: Install dependencies
+#      run: |
+#        python -m pip install "torch${{ matrix.torch-version }}"
+#        python -m pip install .[pytorch-dev]
+#    - name: Run tests
+#      run: |
+#        pytest -m multigpu tests/
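Note for reviewers: the job selection above leans on custom pytest markers (`-m singlegpu` here, `-m "not (singlegpu or multigpu)"` on the CPU runners). Pytest warns on unknown markers unless they are registered; below is a minimal sketch of one way to register them, assuming a `conftest.py` at the repo root (the project may instead declare them in `setup.cfg` or `pyproject.toml`).

```python
# conftest.py -- hypothetical marker registration, not part of this diff.
def pytest_configure(config):
    config.addinivalue_line("markers", "singlegpu: test needs exactly one GPU")
    config.addinivalue_line("markers", "multigpu: test needs multiple GPUs")
```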
37 changes: 33 additions & 4 deletions crossfit/backend/torch/hf/model.py
@@ -1,16 +1,20 @@
 from functools import lru_cache
 import gc
+import logging
 import os
 from crossfit.dataset.home import CF_HOME
 import joblib

 import numpy as np
 import torch
 import torch.nn.functional as F
 from tqdm import tqdm
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 from sklearn.linear_model import LinearRegression
 from crossfit.backend.torch.model import Model

+logger = logging.getLogger(__name__)
+

 class HFModel(Model):
     def __init__(self, path_or_name: str, max_mem_gb: int = 16, training=False):
@@ -67,7 +71,7 @@ def fit_memory_estimate_curve(self, model=None):
             }

             try:
-                outputs = model(batch)
+                outputs = model(**batch)
                 memory_used = torch.cuda.max_memory_allocated() / (1024**2)  # Convert to MB
                 X.append([batch_size, seq_len, seq_len**2])
                 y.append(memory_used)
@@ -100,9 +104,34 @@ def max_seq_length(self) -> int:

 class SentenceTransformerModel(HFModel):
     def load_model(self, device="cuda"):
-        from sentence_transformers import SentenceTransformer
-
-        return SentenceTransformer(self.path_or_name, device="cuda").to(device)
+        model = AutoModel.from_pretrained(self.path_or_name).to(device)
+        if device == "cuda":
+            try:
+                from optimum.bettertransformer import BetterTransformer
+
+                model = BetterTransformer.transform(model.to(torch.float16))
+            except ImportError:
+                logger.warning(
+                    "Loading embedding model without BetterTransformer. "
+                    "Install 'optimum' to make embedding inference faster."
+                )
+        return model
+
+    def get_embedding(self, inputs, outputs):
+        embeddings = self.average_pool(
+            outputs["last_hidden_state"], inputs["attention_mask"]
+        )
+        embeddings = F.normalize(embeddings, p=2, dim=1)
+        return embeddings
+
+    @staticmethod
+    def average_pool(
+        last_hidden_states: torch.Tensor, attention_mask: torch.Tensor
+    ) -> torch.Tensor:
+        last_hidden = last_hidden_states.masked_fill(
+            ~attention_mask[..., None].bool(), 0.0
+        )
+        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

     def load_cfg(self):
         return AutoConfig.from_pretrained("sentence-transformers/" + self.path_or_name)
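Taken together, the new `load_model`/`get_embedding` replace the `sentence_transformers` wrapper with a plain fp16 `AutoModel` that BetterTransformer can rewrite to use fused attention kernels, while mean pooling and L2 normalization move into explicit methods. A self-contained sketch of the equivalent inference path; the model name is an illustrative assumption, not taken from this PR:

```python
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

name = "sentence-transformers/all-MiniLM-L6-v2"  # hypothetical example model
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name).to("cuda").to(torch.float16).eval()

try:
    from optimum.bettertransformer import BetterTransformer
    model = BetterTransformer.transform(model)  # fused attention fast path
except ImportError:
    pass  # eager attention still works, just slower

inputs = tokenizer(["hello world"], padding=True, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model(**inputs)

# Masked mean pooling then L2 normalization, the same math as
# SentenceTransformerModel.average_pool / get_embedding above.
mask = inputs["attention_mask"][..., None].bool()
hidden = outputs.last_hidden_state.masked_fill(~mask, 0.0)
emb = hidden.sum(dim=1) / inputs["attention_mask"].sum(dim=1)[..., None]
emb = F.normalize(emb, p=2, dim=1)
```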
10 changes: 6 additions & 4 deletions crossfit/backend/torch/op/embed.py
@@ -17,13 +17,13 @@ def __init__(
         pre=None,
         cols=False,
         keep_cols=None,
-        default_batch_size=1024,
+        batch_size=1024,
         max_mem: str = "16GB",
         sorted_data_loader: bool = True,
     ):
         super().__init__(pre=pre, cols=cols, keep_cols=keep_cols)
         self.model = model
-        self.batch_size = default_batch_size
+        self.batch_size = batch_size
         self.max_mem = max_mem
         self.max_mem_gb = int(self.max_mem.split("GB")[0]) / 2.5
         self.sorted_data_loader = sorted_data_loader
@@ -50,8 +50,10 @@ def call(self, data, partition_info=None):
         )

         all_embeddings_ls = []
-        for output in loader.map(self.model.get_model(self)):
-            all_embeddings_ls.append(output["sentence_embedding"])
+        model = self.model.get_model(self)
+        for features, outputs in loader.map(lambda batch: (batch, model(**batch))):
+            sentence_embedding = self.model.get_embedding(features, outputs)
+            all_embeddings_ls.append(sentence_embedding)

         out = cudf.DataFrame(index=index)
         embedding = cp.asarray(torch.vstack(all_embeddings_ls))
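The `call` change also updates the map contract: previously the loader mapped the model directly and read a `"sentence_embedding"` key from its output; now the lambda returns `(features, outputs)` pairs so the op can pool outside the forward pass (the attention mask lives in `features`). A toy sketch of that shape, with stand-in objects rather than crossfit's actual loader:

```python
# 'batches' stands in for the sorted data loader: an iterable of tokenized
# feature dicts. 'model' is any callable returning transformer-style outputs.
def map_batches(batches, model):
    for batch in batches:
        # Yield the features alongside the outputs; pooling needs the mask.
        yield batch, model(**batch)
```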
7 changes: 6 additions & 1 deletion crossfit/dataset/beir/raw.py
@@ -236,7 +236,12 @@ def sample_raw(name, out_dir=None, overwrite=False, sample_size=100, blocksize=2
     import cudf
     import dask_cudf

-    full_path = download_raw(name, overwrite=overwrite)
+    # if we are running tests with `pytest tests/`, use testdata.
+    testdata_dir = os.path.join(os.getcwd(), "tests", "testdata", "beir", name)
+    if os.path.exists(testdata_dir):
+        full_path = testdata_dir
+    else:
+        full_path = download_raw(name, overwrite=overwrite)

     out_dir = out_dir or CF_HOME
     sampled_dir = os.path.join(out_dir, "sampled")
3 changes: 1 addition & 2 deletions tests/backend/dask_backend/test_aggregate.py
@@ -2,9 +2,8 @@

 from crossfit.calculate.aggregate import Aggregator, metric_key
 from crossfit.data.dataframe.dispatch import CrossFrame
-from crossfit.metric.continuous.range import Range
 from crossfit.metric import Mean
-
+from crossfit.metric.continuous.range import Range
 from tests.utils import is_leaf_node_instance_of, sample_df
3 changes: 1 addition & 2 deletions tests/backend/pytorch_backend/test_torch_convert.py
@@ -1,6 +1,5 @@
-import pytest
-
 import numpy as np
+import pytest
 import torch

 from crossfit.data import convert_array
6 changes: 2 additions & 4 deletions tests/backend/test_sklearn.py
@@ -1,13 +1,11 @@
-import pytest
-
 import numpy as np
+import pytest
 from sklearn import metrics
-from sklearn.utils.multiclass import type_of_target
 from sklearn.utils._array_api import get_namespace
+from sklearn.utils.multiclass import type_of_target

 from crossfit.data import crossarray, np_backend_dispatch

-
 arr1 = [1, 2, 3]
 arr2 = [4, 5, 6]
3 changes: 1 addition & 2 deletions tests/data/array/test_conversion.py
@@ -1,6 +1,5 @@
-import pytest
-
 import numpy as np
+import pytest

 from crossfit.data import convert_array
4 changes: 1 addition & 3 deletions tests/data/array/test_decorator.py
@@ -1,7 +1,5 @@
-import pytest
-
 import numpy as np
-
+import pytest

 from crossfit.data import crossarray
 from crossfit.utils import test_utils
4 changes: 1 addition & 3 deletions tests/data/dataframe/test_dispatch.py
@@ -1,14 +1,12 @@
-import pytest
-
 import numpy as np
+import pytest

 from crossfit.backend.pandas.dataframe import PandasDataFrame
 from crossfit.data.dataframe.core import ArrayBundle
 from crossfit.data.dataframe.dispatch import CrossFrame


 def test_pandas_frame():
-
     arr1 = np.arange(10)
     arr2 = np.ones(10)
     arr3 = np.array(["cat", "dog"] * 5)
30 changes: 30 additions & 0 deletions tests/dataset/test_load.py
@@ -0,0 +1,30 @@
+import pytest
+
+beir = pytest.importorskip("beir")
+
+import os
+import random
+
+import crossfit as cf
+from crossfit.dataset.beir.raw import BEIR_DATASETS
+
+DATASETS = set(BEIR_DATASETS.keys())
+DATASETS.discard("cqadupstack")
+DATASETS.discard("germanquad")
+
+
+@pytest.mark.singlegpu
+@pytest.mark.parametrize("dataset", DATASETS)
+def test_load_beir(dataset):
+    data = cf.load_dataset(f"beir/{dataset}", overwrite=True, tiny_sample=True)
+
+    for split in ["train", "val", "test"]:
+        split_data = getattr(data, split)
+
+        if split_data is None:
+            continue
+
+        split = split_data.ddf().compute()
+
+        assert split["query-index"].nunique() == split["query-id"].nunique()
+        assert split["query-id"].nunique() <= 100
51 changes: 51 additions & 0 deletions tests/metrics/ranking/test_f1.py
@@ -0,0 +1,51 @@
+import pytest
+
+pytest.importorskip("cupy")
+
+from crossfit.data.sparse.ranking import SparseBinaryLabels, SparseRankings
+from crossfit.metric.ranking import F1
+
+y1 = [0, 5]
+y2 = [8, 9]
+y3 = []
+y4 = [1, 2, 3, 4, 5, 6]
+y5 = [3]
+y6 = [0, 1]
+y7 = [[]]
+
+r1 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+r2 = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+r3 = []
+r4 = [1, 6, 8]
+
+yn1 = [0, 3, 4]
+yn2 = [2, 0, 5]
+
+rn1 = [0, 1, 2]
+rn2 = [2, 1, 0]
+
+
+class TestF1:
+    @pytest.mark.parametrize(
+        "k,y_gold,y_pred,expect",
+        [
+            (3, y2, r1, [0.0]),
+            (10, y2, r1, [1.0 / 3]),
+            (2, y2, r2, [1.0]),
+            (10, y2, r3, [0.0]),
+            (1, y4, r4, [0.285714286]),
+            (10, y3, r3, [float("nan")]),
+            (10, y3, r2, [float("nan")]),
+            (1, y1, [r1, r2], [2.0 / 3, 0.0]),
+            (1, [y1, y1], [r1, r2], [2.0 / 3, 0.0]),
+        ],
+    )
+    def test_score(self, k, y_gold, y_pred, expect):
+        y_gold = SparseBinaryLabels.from_positive_indices(y_gold)
+        if len(y_pred) == 0 or [] in y_pred:
+            with pytest.warns(UserWarning):
+                y_pred = SparseRankings.from_ranked_indices(y_pred)
+        else:
+            y_pred = SparseRankings.from_ranked_indices(y_pred)
+        pred = F1(k).score(y_gold, y_pred, nan_handling="propagate").tolist()
+        assert pred == pytest.approx(expect, nan_ok=True)
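As a sanity check on the expectations above: F1@k is the harmonic mean of precision@k and recall, undefined when there are no relevant items. For `(1, y4, r4)` the top-1 item (`1`) is relevant, so precision is 1 and recall is 1/6, giving 2/7 ≈ 0.285714. A hypothetical scalar reference that reproduces the table, not the library's actual implementation:

```python
def f1_at_k(gold, ranking, k):
    if not gold:
        return float("nan")  # matches nan_handling="propagate" for empty labels
    hits = len(set(ranking[:k]) & set(gold))
    if hits == 0:
        return 0.0  # also covers empty rankings
    precision = hits / min(k, len(ranking))
    recall = hits / len(gold)
    return 2 * precision * recall / (precision + recall)

assert abs(f1_at_k([1, 2, 3, 4, 5, 6], [1, 6, 8], 1) - 2 / 7) < 1e-9
assert abs(f1_at_k([8, 9], list(range(10)), 10) - 1 / 3) < 1e-9
```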
54 changes: 54 additions & 0 deletions tests/metrics/ranking/test_hitrate.py
@@ -0,0 +1,54 @@
+import pytest
+
+pytest.importorskip("cupy")
+
+from crossfit.data.sparse.ranking import SparseBinaryLabels, SparseRankings
+from crossfit.metric.ranking import HitRate
+
+y1 = [0, 5]
+y2 = [8, 9]
+y3 = []
+y4 = [1, 2, 3, 4, 5, 6]
+y5 = [3]
+y6 = [0, 1]
+y7 = [[]]
+
+r1 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+r2 = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+r3 = []
+r4 = [1, 6, 8]
+
+yn1 = [0, 3, 4]
+yn2 = [2, 0, 5]
+
+rn1 = [0, 1, 2]
+rn2 = [2, 1, 0]
+
+
+class TestHitrate:
+    @pytest.mark.parametrize(
+        "k,y_gold,y_pred,expect",
+        [
+            (3, y5, r1, [0.0]),
+            (3, y5, r2, [0.0]),
+            (4, y5, r1, [1.0]),
+            (4, y5, r2, [0.0]),
+            (4, y1, r1, [float("nan")]),
+            (4, y1, r2, [float("nan")]),
+            (10, y3, r3, [float("nan")]),
+            (10, y5, r3, [0.0]),
+            (10, y3, r2, [float("nan")]),
+            (5, y5, [r1, r2], [1.0, 0.0]),
+            (5, [y5, y1], [r1, r2], [1.0, float("nan")]),
+        ],
+    )
+    def test_score(self, k, y_gold, y_pred, expect):
+        y_gold = SparseBinaryLabels.from_positive_indices(y_gold)
+        if len(y_pred) == 0 or [] in y_pred:
+            with pytest.warns(UserWarning):
+                y_pred = SparseRankings.from_ranked_indices(y_pred)
+        else:
+            y_pred = SparseRankings.from_ranked_indices(y_pred)
+        pred = HitRate(k).score(y_gold, y_pred, nan_handling="propagate").tolist()
+
+        assert pred == pytest.approx(expect, nan_ok=True)
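The expectations encode HitRate's domain: the metric assumes exactly one relevant item per query, so `y1` (two positives) and empty label sets propagate NaN, while an empty ranking with one valid label scores 0. A hypothetical scalar sketch consistent with the table, again not the library's code:

```python
import math

def hitrate_at_k(gold, ranking, k):
    if len(gold) != 1:
        return float("nan")  # undefined unless exactly one relevant item
    return 1.0 if gold[0] in ranking[:k] else 0.0

assert hitrate_at_k([3], list(range(10)), 4) == 1.0
assert hitrate_at_k([3], list(reversed(range(10))), 4) == 0.0
assert math.isnan(hitrate_at_k([0, 5], list(range(10)), 4))
```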