Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for timestamp datatype #106

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,13 @@ A template datatype can be found in `keras_pandas/data_types/Abstract.py`. Filli
To add a new datatype:

- Create a new `.py` file in `keras_pandas/data_types`, based on `keras_pandas/data_types/Abstract.py` (and perhaps
referencing `keras_pandas/data_types/Numerical.py`)
following `keras_pandas/data_types/Numerical.py` as an example)
- Fill out your new datatype's `.py` file
- Create a new test class for your new datatype (perhaps based on `tests/testDatatypeTemplate.py` and / or
`tests/testNumerical.py`)
- Add the new datatype to `keras_pandas/Automater.datatype_handlers`, in `keras_pandas/Automater.__init__()`
- Add the new datatype to `docs/index.rst`, in `autosummary list`
- Add the new datatype to `docs/index.rst`, in `autosummary list`
- (Optional) Update examples in `examples/` to utilize your new datatype

## Adding new examples

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ board games with my partner in Seattle.

### Development

- There's nothing here! (yet)
- Support for timestamp datatype (#106)

### 3.1.0

Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ API
data_types.Numerical.Numerical
data_types.Text.Text
data_types.TimeSeries.TimeSeries
data_types.Timestamp.Timestamp
lib
transformations

3 changes: 2 additions & 1 deletion examples/instanbul_predict_ise.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def main():
data_type_dict = {'numerical': ['ise', 'ise.1', 'sp', 'dax', 'ftse', 'nikkei', 'bovespa', 'eu', 'em'],
'categorical': [],
'text': [],
'timeseries': ['ise_lagged', 'ise.1_lagged', 'sp_lagged', 'dax_lagged']}
'timeseries': ['ise_lagged', 'ise.1_lagged', 'sp_lagged', 'dax_lagged'],
'timestamp': ['date']}
output_var = 'ise'

# Create and fit Automater
Expand Down
3 changes: 2 additions & 1 deletion examples/lending_club_predict_dti.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def main():
'total_acc', 'pub_rec_bankruptcies'],
'categorical': ['term', 'grade', 'emp_length', 'home_ownership', 'addr_state',
'application_type', 'disbursement_method'],
'text': ['desc', 'purpose', 'title']}
'text': ['desc', 'purpose', 'title'],
'timestamp': ['last_credit_pull_d', 'issue_d']}
output_var = 'dti'

# Create and fit Automater
Expand Down
3 changes: 2 additions & 1 deletion examples/lending_club_predict_loan_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def main():
'total_acc', 'pub_rec_bankruptcies'],
'categorical': ['term', 'grade', 'emp_length', 'home_ownership', 'loan_status', 'addr_state',
'application_type', 'disbursement_method'],
'text': ['desc', 'purpose', 'title']}
'text': ['desc', 'purpose', 'title'],
'timestamp': ['last_credit_pull_d', 'issue_d']}
output_var = 'loan_status'

# Create and fit Automater
Expand Down
10 changes: 7 additions & 3 deletions keras_pandas/Automater.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from keras_pandas.data_types.Numerical import Numerical
from keras_pandas.data_types.Text import Text
from keras_pandas.data_types.TimeSeries import TimeSeries
from keras_pandas.data_types.Timestamp import Timestamp


class Automater():
Expand Down Expand Up @@ -49,11 +50,14 @@ def __init__(self, data_type_dict=dict(), output_var=None, datatype_handlers=dic
self.supervised = self.output_var is not None

# Set up datatype handlers
self.datatype_handlers = {'numerical': Numerical(),
self.datatype_handlers = {'boolean': Boolean(),
'categorical': Categorical(),
'boolean': Boolean(),
'numerical': Numerical(),
'text': Text(),
'timeseries': TimeSeries(),
'text': Text()}
'timestamp': Timestamp()
}


# Add user-supplied datatype handlers
self.datatype_handlers.update(datatype_handlers)
Expand Down
3 changes: 2 additions & 1 deletion keras_pandas/data_types/Abstract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
class AbstractDatatype():
"""
Interface for all future datatypes
Support for ABSTRACT variables, such as EXAMPLE1: `[175000, 105000, 30000000]`, or EXAMPLE2: `[
1, 7, 22, 183, 12]`.
"""

def __init__(self):
Expand Down
5 changes: 3 additions & 2 deletions keras_pandas/data_types/Numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import keras
from keras.layers import Dense
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from keras_pandas import lib

Expand All @@ -15,7 +16,7 @@ class Numerical():

def __init__(self):
self.supports_output = True
self.default_transformation_pipeline = [Imputer(strategy='mean'), StandardScaler()]
self.default_transformation_pipeline = [SimpleImputer(strategy='mean'), StandardScaler()]

@staticmethod
def input_nub_generator(variable, transformed_observations):
Expand Down
102 changes: 102 additions & 0 deletions keras_pandas/data_types/Timestamp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from keras import Input
from keras.layers import Dense
from sklearn.impute import SimpleImputer

from keras_pandas import lib
from keras_pandas.transformations import TypeConversionEncoder, TimestampVectorizer

class Timestamp():
    """
    Support for timestamp variables, such as date_of_birth: `['1992-01-24', '2018-12-28', '1991-10-29']`,
    or purchase_timestamp: `['December 29, 2018 1:53:05 AM', 'January 24, 1992 1:53:05 AM',
    'January 1, 1970 12:00:42 AM']`.
    """

    def __init__(self):
        # Timestamps are input-only: there is no output nub / loss for predicting a raw timestamp
        self.supports_output = False
        self.default_transformation_pipeline = [TypeConversionEncoder(str), TimestampVectorizer(),
                                                SimpleImputer(strategy='constant', fill_value=0)]

    def input_nub_generator(self, variable, transformed_observations):
        """
        Generate an input layer and input 'nub' for a Keras network.

        - input_layer: The input layer accepts data from the outside world.
        - input_nub: The input nub will always include the input_layer as its first layer. It may also include
          other layers for handling the data type in specific ways

        :param variable: Name of the variable
        :type variable: str
        :param transformed_observations: A dataframe, containing either the specified variable, or derived variables
        :type transformed_observations: pandas.DataFrame
        :return: A tuple containing the input layer, and the last layer of the nub
        """

        # Get transformed data for shaping. The transformer may expand the variable into multiple
        # derived columns, named with the variable as a prefix (e.g. `date_0`, `date_1`, ...)
        if variable in transformed_observations.columns:
            variable_list = [variable]
        else:
            variable_name_prefix = variable + '_'
            variable_list = [col for col in transformed_observations.columns
                             if col.startswith(variable_name_prefix)]
        # BUGFIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
        # to_numpy() is the supported replacement
        transformed = transformed_observations[variable_list].to_numpy()

        # Set up dimensions for input_layer layer
        if len(transformed.shape) >= 2:
            input_sequence_length = int(transformed.shape[1])
        else:
            input_sequence_length = 1

        # Modest dense layer: roughly half the input width, capped at 10 units
        num_dense_units = int(min((input_sequence_length + 1) / 2, 10))

        input_layer = Input(shape=(input_sequence_length,),
                            name=lib.namespace_conversion('input_{}'.format(variable)))
        x = input_layer
        x = Dense(num_dense_units)(x)
        input_nub = x

        return input_layer, input_nub

    def output_nub_generator(self, variable, input_observations):
        """
        Generate an output layer for a Keras network.

        - output_layer: A keras layer, which is formatted to correctly accept the response variable

        Timestamps do not support output, so this always raises via `_check_output_support`.

        :param variable: A Variable contained in the input_df
        :type variable: str
        :param input_observations: A dataframe, containing either the specified variable, or derived variables
        :type input_observations: pandas.DataFrame
        :return: output_layer
        :raises ValueError: Always, because this datatype does not support output
        """
        self._check_output_support()
        output_nub = None

        return output_nub

    def output_inverse_transform(self, y_pred, response_transform_pipeline):
        """
        Undo the transforming that was done to get data into a keras model. This inverse transformation will
        render the observations so they can be compared to the data in the natural scale provided by the user.

        Timestamps do not support output, so this always raises via `_check_output_support`.

        :param response_transform_pipeline: An SKLearn transformation pipeline, trained on the same variable as the
            model which produced y_pred
        :param y_pred: The data predicted by keras
        :return: The same data, in the natural basis
        :raises ValueError: Always, because this datatype does not support output
        """
        self._check_output_support()
        natural_scaled_vars = None

        return natural_scaled_vars

    def output_suggested_loss(self):
        """
        Suggested loss for training against this datatype as a response variable.

        Timestamps do not support output, so this always raises via `_check_output_support`.

        :raises ValueError: Always, because this datatype does not support output
        """
        self._check_output_support()
        suggested_loss = None
        return suggested_loss

    def _check_output_support(self):
        """
        Guard used by all output-related methods: raise if this datatype does not support output.

        :return: True, if output is supported
        :raises ValueError: If `self.supports_output` is falsy
        """
        if not self.supports_output:
            raise ValueError('This datatype: {} does not support output, but has called to an output related '
                             'function.'.format(self.__class__))
        return True
4 changes: 2 additions & 2 deletions keras_pandas/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,15 +169,15 @@ def load_lending_club(test_run=True):
if test_run:
observations = observations.sample(300)

logging.info('Available lending club columns: {}'.format(observations.columns))
logging.info('Available lending club columns: {}'.format(list(observations.columns)))
return observations


def load_instanbul_stocks(as_ts=False):
logging.info('Loading Instanbul data')
file_path = download_file('https://archive.ics.uci.edu/ml/machine-learning-databases/00247/data_akbilgic.xlsx',
'~/.keras-pandas/example_datasets/',
filename='instanbul_stocks.xlsw')
filename='instanbul_stocks.xlsx')
logging.info('Reading data from filepath: {}'.format(file_path))

observations = pandas.read_excel(file_path, header=1)
Expand Down
86 changes: 86 additions & 0 deletions keras_pandas/transformations.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""
SKLearn-compliant transformers, for use as part of pipelines
"""
import datetime
import logging
from collections import defaultdict

import dateinfer
import numpy
import pandas
from gensim.utils import simple_preprocess
Expand Down Expand Up @@ -459,3 +461,87 @@ def transform(self, X):
# Pad all of the sequences to be the same length
X = pad_sequences(X, maxlen=self.max_sequence_length)
return X


class TimestampVectorizer(TransformerMixin, BaseEstimator):
    """
    Convert a timestamp string into usable information by:

    - Determining timestamp format string (fit only)
    - Converting string into timestamp (or NaN if not possible), using timestamp format string
    - Convert timestamp to Unix Epoch time
    - Calculating sine / cosine values, with frequencies described by `frequency_labels`
    """

    # NOTE: a tuple default avoids Python's shared-mutable-default pitfall; lists are still accepted
    def __init__(self, frequency_labels=('minutely', 'hourly', 'daily', 'weekly', 'monthly', 'quarterly', 'yearly')):
        self.frequency_labels = frequency_labels
        self.trig_periods = self.frequency_labels_to_trig_periods(self.frequency_labels)
        self.strptime_format = None

    def fit(self, X, y=None):
        """
        Infer the strptime format string from the observed values, unless one was already set.

        :param X: Observations containing timestamp strings
        :param y: Ignored; present for SKLearn pipeline compatibility
        :return: self
        """
        if self.strptime_format is None:
            logging.info('No self.strptime_format set. Inferring datetime format')
            # NOTE(review): X[0] is the first row only — presumably enough for dateinfer, but
            # inferring from a larger sample (e.g. the whole first column) may be more robust; confirm
            self.strptime_format = dateinfer.infer(list(X[0]))
            logging.info('Inferred datetime format: {}'.format(self.strptime_format))
        else:
            logging.info('self.strptime_format previously set. Inferred datetime format: {}'.format(
                self.strptime_format))
        return self

    def transform(self, X):
        """
        Convert timestamp strings to epoch seconds, then to sine / cosine features at each
        configured frequency.

        :param X: Observations containing timestamp strings, shape (n, 1)
        :return: numpy array of shape (n, 2 * len(self.frequency_labels)), sine features then cosine features
        """
        # Convert from string to epoch time (NaN where unparseable; imputed downstream)
        f = numpy.vectorize(lambda x: self.string_to_epoch(x, self.strptime_format))
        X = f(X)

        # Convert to appropriately scaled trig inputs (broadcasts (n, 1) against the period list)
        X = self.trig_periods * X

        # Apply sine / cosine
        X_sin = numpy.sin(X)
        X_cos = numpy.cos(X)

        X = numpy.concatenate([X_sin, X_cos], axis=1)
        return X

    @staticmethod
    def string_to_epoch(value, strptime_format):
        """
        Parse a single timestamp string into Unix epoch seconds.

        :param value: A timestamp string
        :param strptime_format: The strptime format string to parse with
        :return: Seconds since 1970-01-01 (float), or numpy.nan if the value cannot be parsed
        """
        try:
            # Convert from string to datetime
            value = datetime.datetime.strptime(value, strptime_format)

            # Attempt to convert to epoch
            value = (value - datetime.datetime(1970, 1, 1)).total_seconds()

            return value

        # BUGFIX: narrowed from a bare `except:` (which also swallowed KeyboardInterrupt / SystemExit).
        # strptime raises ValueError on bad input and TypeError on non-string values.
        except (ValueError, TypeError):
            return numpy.nan

    @staticmethod
    def frequency_labels_to_trig_periods(frequency_labels):
        """
        Convert human-readable frequency labels into angular frequencies (2*pi / period_seconds),
        suitable for scaling epoch times before sine / cosine.

        :param frequency_labels: Iterable of labels, each a key of the `conversions` table below
        :return: List of angular frequencies, one per label, in input order
        :raises AssertionError: If any label is not a known frequency
        """
        # Approximate period of each frequency, in seconds
        conversions = {
            'minutely': 60,
            'hourly': 3600,
            'daily': 86400,
            'weekly': 604800,
            'monthly': 2628288,
            'quarterly': 7883991,
            'yearly': 31535965
        }

        # Check for unknown initial frequency_labels
        for initial_frequency in frequency_labels:
            if initial_frequency not in conversions:
                # list(...) so the error message shows the options, not a dict_keys repr
                raise AssertionError('Unknown initial frequency: {}. Please choose from: {}'.format(
                    initial_frequency, list(conversions.keys())))

        # Convert initial frequency_labels to duration in seconds
        initial_frequencies_seconds = [conversions[label] for label in frequency_labels]

        # Convert into trig_periods (angular frequency for each period)
        trig_periods = [(2 * numpy.pi) / seconds for seconds in initial_frequencies_seconds]

        return trig_periods
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
git+https://github.com/jeffreystarr/dateinfer.git@refs/pull/12/head
gensim==3.6.0
h5py==2.8.0
Keras==2.2.4
Expand Down
2 changes: 1 addition & 1 deletion tests/testDatatypeTemplate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from tests.testbase import TestBase


class TestNumerical(TestBase):
class TestDatatypeTemplate(TestBase):

def test_init(self):
# TODO Create datatype
Expand Down
Loading