From 05fff228826d25c5e7cf4eccdf240c683228c468 Mon Sep 17 00:00:00 2001 From: Miki Tebeka Date: Thu, 14 Feb 2019 18:12:33 +0200 Subject: [PATCH 1/7] CONDA_INSTALL --- _scripts/install-cudf-env.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/_scripts/install-cudf-env.sh b/_scripts/install-cudf-env.sh index 8526eaca..0f95afdb 100755 --- a/_scripts/install-cudf-env.sh +++ b/_scripts/install-cudf-env.sh @@ -23,15 +23,17 @@ mv go goroot echo 'export GOROOT=${HOME}/goroot' >> ~/.bashrc echo 'export PATH=${GOROOT}/bin:${PATH}' >> ~/.bashrc +CONDA_INSTALL="~/miniconda3/bin/conda install -y" + # Install cudf -~/miniconda3/bin/conda install \ +${CONDA_INSTALL} \ -c nvidia -c rapidsai -c pytorch -c numba \ -c conda-forge -c defaults \ cudf=0.5 cuml=0.5 python=3.6 -~/miniconda3/bin/conda install cudatoolkit=9.2 +${CONDA_INSTALL} cudatoolkit=9.2 # Install testing -~/miniconda3/bin/conda install pytest pyyaml +${CONDA_INSTALL} pytest pyyaml # Get frames code git clone https://github.com/v3io/frames.git From d44f0b6566e9cbef0c28e9479cf87759b9d53d12 Mon Sep 17 00:00:00 2001 From: Miki Tebeka Date: Thu, 14 Feb 2019 18:35:39 +0200 Subject: [PATCH 2/7] concat as parameter --- clients/py/v3io_frames/__init__.py | 7 +++++-- clients/py/v3io_frames/client.py | 6 +++++- clients/py/v3io_frames/grpc.py | 2 +- clients/py/v3io_frames/http.py | 2 +- clients/py/v3io_frames/pdutils.py | 9 ++------- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/clients/py/v3io_frames/__init__.py b/clients/py/v3io_frames/__init__.py index 6246384b..b9297a75 100644 --- a/clients/py/v3io_frames/__init__.py +++ b/clients/py/v3io_frames/__init__.py @@ -36,7 +36,8 @@ def Client(address='', data_url='', container='', path='', user='', - password='', token='', session_id='', frame_factory=pd.DataFrame): + password='', token='', session_id='', frame_factory=pd.DataFrame, + concat=pd.concat): """Return a new client. Parameters @@ -60,6 +61,8 @@ def Client(address='', data_url='', container='', path='', user='', Session ID (session info) frame_factory : class DataFrame factory + concat : function + Function to concat DataFrames """ protocol = urlparse(address).scheme or 'grpc' if protocol not in _known_protocols: @@ -78,7 +81,7 @@ def Client(address='', data_url='', container='', path='', user='', ) cls = gRPCClient if protocol == 'grpc' else HTTPClient - return cls(address, session, frame_factory=frame_factory) + return cls(address, session, frame_factory=frame_factory, concat=concat) def session_from_env(): diff --git a/clients/py/v3io_frames/client.py b/clients/py/v3io_frames/client.py index 05471bb5..8cf9d5ae 100644 --- a/clients/py/v3io_frames/client.py +++ b/clients/py/v3io_frames/client.py @@ -24,7 +24,8 @@ class ClientBase: - def __init__(self, address, session, frame_factory=pd.DataFrame): + def __init__(self, address, session, frame_factory=pd.DataFrame, + concat=pd.concat): """Create new client Parameters @@ -35,6 +36,8 @@ def __init__(self, address, session, frame_factory=pd.DataFrame): Session object frame_factory : class DataFrame factory (currencly pandas and cudf supported) + concat : function + Function to concat DataFrames """ address = address or environ.get('V3IO_FRAMESD') if not address: @@ -42,6 +45,7 @@ def __init__(self, address, session, frame_factory=pd.DataFrame): self.address = self._fix_address(address) self.session = session self.frame_factory = frame_factory + self.concat = concat def read(self, backend='', table='', query='', columns=None, filter='', group_by='', limit=0, data_format='', row_layout=False, diff --git a/clients/py/v3io_frames/grpc.py b/clients/py/v3io_frames/grpc.py index b1899fb7..8f5f27cc 100644 --- a/clients/py/v3io_frames/grpc.py +++ b/clients/py/v3io_frames/grpc.py @@ -89,7 +89,7 @@ def _read(self, backend, table, query, columns, filter, group_by, limit, backend, table, query, columns, filter, group_by, limit, data_format, row_layout, max_in_message, marker, **kw) if not iterator: - return concat_dfs(dfs) + return concat_dfs(dfs, self.frame_factory, self.concat) return dfs @grpc_raise(WriteError) diff --git a/clients/py/v3io_frames/http.py b/clients/py/v3io_frames/http.py index e5e79e5a..34476817 100644 --- a/clients/py/v3io_frames/http.py +++ b/clients/py/v3io_frames/http.py @@ -87,7 +87,7 @@ def _read(self, backend, table, query, columns, filter, group_by, limit, dfs = self._iter_dfs(resp.raw) if not iterator: - return concat_dfs(dfs) + return concat_dfs(dfs, self.frame_factory, self.concat) return dfs @connection_error(WriteError) diff --git a/clients/py/v3io_frames/pdutils.py b/clients/py/v3io_frames/pdutils.py index 66596081..cc515c59 100644 --- a/clients/py/v3io_frames/pdutils.py +++ b/clients/py/v3io_frames/pdutils.py @@ -18,25 +18,20 @@ from .pbutils import is_categorical_dtype -def concat_dfs(dfs, frame_factory=pd.DataFrame): +def concat_dfs(dfs, frame_factory=pd.DataFrame, concat=pd.concat): """Concat sequence of DataFrames, can handle MultiIndex frames.""" dfs = list(dfs) if not dfs: return frame_factory() - if not isinstance(dfs[0], pd.DataFrame): - import cudf - return cudf.concat(dfs) - # Make sure concat keep categorical columns # See https://stackoverflow.com/a/44086708/7650 align_categories(dfs) names = list(dfs[0].index.names) - wdf = pd.concat( + wdf = concat( [df.reset_index() for df in dfs], ignore_index=True, - sort=False, ) if len(names) > 1: From 20703b5a28bec0b51c58fc02e14229a635a0d196 Mon Sep 17 00:00:00 2001 From: Miki Tebeka Date: Sun, 17 Feb 2019 10:14:30 +0200 Subject: [PATCH 3/7] ~ -> ${HOME} --- _scripts/install-cudf-env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_scripts/install-cudf-env.sh b/_scripts/install-cudf-env.sh index 0f95afdb..d2ee2534 100755 --- a/_scripts/install-cudf-env.sh +++ b/_scripts/install-cudf-env.sh @@ -23,7 +23,7 @@ mv go goroot echo 'export GOROOT=${HOME}/goroot' >> ~/.bashrc echo 'export PATH=${GOROOT}/bin:${PATH}' >> ~/.bashrc -CONDA_INSTALL="~/miniconda3/bin/conda install -y" +CONDA_INSTALL="${HOME}/miniconda3/bin/conda install -y" # Install cudf ${CONDA_INSTALL} \ From 5f442bc87069c3fe18f920531349024781641c8d Mon Sep 17 00:00:00 2001 From: Miki Tebeka Date: Sun, 17 Feb 2019 11:25:02 +0200 Subject: [PATCH 4/7] concat --- _scripts/install-cudf-env.sh | 3 +++ clients/py/environment-cudf.yml | 12 ++++++++++++ clients/py/environment.yml | 7 +++++++ clients/py/tests/test_cudf.py | 14 ++++++++++++++ clients/py/v3io_frames/pdutils.py | 3 +-- 5 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 clients/py/environment-cudf.yml create mode 100644 clients/py/environment.yml diff --git a/_scripts/install-cudf-env.sh b/_scripts/install-cudf-env.sh index d2ee2534..fc5e3214 100755 --- a/_scripts/install-cudf-env.sh +++ b/_scripts/install-cudf-env.sh @@ -37,3 +37,6 @@ ${CONDA_INSTALL} pytest pyyaml # Get frames code git clone https://github.com/v3io/frames.git + +# Install frames dependencies +conda install grpcio-tools=1.16.1 protobuf=3.6.1 requests=2.21.0 diff --git a/clients/py/environment-cudf.yml b/clients/py/environment-cudf.yml new file mode 100644 index 00000000..4331971c --- /dev/null +++ b/clients/py/environment-cudf.yml @@ -0,0 +1,12 @@ +channels: +- conda-forge +- defaults +- numba +- nvidia +- pytorch +- rapidsai +dependencies: +- cudf=0.5 +- cuml=0.5 +- python=3.6 +- cudatoolkit=9.2 diff --git a/clients/py/environment.yml b/clients/py/environment.yml new file mode 100644 index 00000000..de0e67eb --- /dev/null +++ b/clients/py/environment.yml @@ -0,0 +1,7 @@ +channels: +- defaults +dependencies: +- grpcio-tools=1.16.1 +- protobuf=3.6.1 +- requests=2.21.0 +- pandas>=0.23.* diff --git a/clients/py/tests/test_cudf.py b/clients/py/tests/test_cudf.py index 500028ad..5eeecd9a 100644 --- a/clients/py/tests/test_cudf.py +++ b/clients/py/tests/test_cudf.py @@ -14,6 +14,7 @@ from time import sleep, time +import pandas as pd import pytest import v3io_frames as v3f @@ -45,3 +46,16 @@ def test_cudf(framesd, session): assert isinstance(rdf, cudf.DataFrame), 'not a cudf.DataFrame' assert len(rdf) == len(df), 'wrong frame size' assert set(rdf.columns) == set(df.columns), 'columns mismatch' + + +@pytest.mark.skipif(not has_cudf, reason='cudf not found') +def test_concat_categorical(): + df1 = cudf.DataFrame({'a': range(10, 13), 'b': range(50, 53)}) + df1['c'] = pd.Series(['a']*3, dtype='category') + + df2 = cudf.DataFrame({'a': range(20, 23), 'b': range(60, 63)}) + df2['c'] = pd.Series(['b']*3, dtype='category') + + df = v3f.pdutils.concat_dfs([df1, df2], cudf.DataFrame, cudf.concat) + assert len(df) == len(df1) + len(df2), 'bad concat size' + assert v3f.pdutils.is_categorical_dtype(df['c']), 'result not categorical' diff --git a/clients/py/v3io_frames/pdutils.py b/clients/py/v3io_frames/pdutils.py index cc515c59..74ad667c 100644 --- a/clients/py/v3io_frames/pdutils.py +++ b/clients/py/v3io_frames/pdutils.py @@ -61,5 +61,4 @@ def align_categories(dfs): for df in dfs: for col in df.columns: if is_categorical_dtype(df[col].dtype): - cats = all_cats - set(df[col].cat.categories) - df[col].cat.add_categories(cats, inplace=True) + df[col] = df[col].cat.set_categories(all_cats) From d6b23ea8d068442f3f12331eea7531bdeb206e9a Mon Sep 17 00:00:00 2001 From: Miki Tebeka Date: Sun, 17 Feb 2019 11:29:47 +0200 Subject: [PATCH 5/7] no index.names in cudf --- clients/py/v3io_frames/pdutils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/clients/py/v3io_frames/pdutils.py b/clients/py/v3io_frames/pdutils.py index 74ad667c..ab44b9f9 100644 --- a/clients/py/v3io_frames/pdutils.py +++ b/clients/py/v3io_frames/pdutils.py @@ -28,7 +28,11 @@ def concat_dfs(dfs, frame_factory=pd.DataFrame, concat=pd.concat): # See https://stackoverflow.com/a/44086708/7650 align_categories(dfs) - names = list(dfs[0].index.names) + if hasattr(dfs[0].index, 'names'): + names = list(dfs[0].index.names) + else: + names = [dfs[0].index.name] + wdf = concat( [df.reset_index() for df in dfs], ignore_index=True, From 69c86bf20c05cef71fa4979814f373f1a4a30bfd Mon Sep 17 00:00:00 2001 From: Miki Tebeka Date: Sun, 17 Feb 2019 11:49:20 +0200 Subject: [PATCH 6/7] index might be there --- clients/py/v3io_frames/pdutils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clients/py/v3io_frames/pdutils.py b/clients/py/v3io_frames/pdutils.py index ab44b9f9..06baa7fb 100644 --- a/clients/py/v3io_frames/pdutils.py +++ b/clients/py/v3io_frames/pdutils.py @@ -32,6 +32,7 @@ def concat_dfs(dfs, frame_factory=pd.DataFrame, concat=pd.concat): names = list(dfs[0].index.names) else: names = [dfs[0].index.name] + had_index = 'index' in dfs[0].columns wdf = concat( [df.reset_index() for df in dfs], @@ -47,7 +48,8 @@ def concat_dfs(dfs, frame_factory=pd.DataFrame, concat=pd.concat): elif names[0]: wdf = wdf.set_index(names[0]) elif names[0] is None: - del wdf['index'] # Pandas will add 'index' column + if not had_index and 'index' in wdf.columns: + del wdf['index'] # Pandas will add 'index' column with warnings.catch_warnings(): warnings.simplefilter('ignore') From f8a945a523e10252fab2af9139debf1130931ae8 Mon Sep 17 00:00:00 2001 From: Miki Tebeka Date: Sun, 17 Feb 2019 11:52:44 +0200 Subject: [PATCH 7/7] oops --- clients/py/tests/test_cudf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clients/py/tests/test_cudf.py b/clients/py/tests/test_cudf.py index 5eeecd9a..2ce3ab7f 100644 --- a/clients/py/tests/test_cudf.py +++ b/clients/py/tests/test_cudf.py @@ -58,4 +58,5 @@ def test_concat_categorical(): df = v3f.pdutils.concat_dfs([df1, df2], cudf.DataFrame, cudf.concat) assert len(df) == len(df1) + len(df2), 'bad concat size' - assert v3f.pdutils.is_categorical_dtype(df['c']), 'result not categorical' + dtype = df['c'].dtype + assert v3f.pdutils.is_categorical_dtype(dtype), 'result not categorical'