From 265e496d0cfb2f8c5f270bd9d59d00ee106032dd Mon Sep 17 00:00:00 2001
From: Haifeng Jin
Date: Sun, 27 Sep 2020 01:36:02 -0500
Subject: [PATCH] Docs (#1355)

* fix broken tutorials

* Update docs for 1.0.9 release.
---
 docs/py/customized.py                     |   2 +-
 docs/py/export.py                         |   2 +-
 docs/py/image_classification.py           |   8 +-
 docs/py/image_regression.py               |   4 +-
 docs/py/load.py                           | 169 ++++++++++++++++++++++
 docs/py/multi.py                          |   2 +-
 docs/py/structured_data_classification.py |  14 +-
 docs/py/structured_data_regression.py     |   4 +-
 docs/py/text_classification.py            |  55 ++++---
 docs/py/text_regression.py                |  47 +++---
 docs/templates/install.md                 |   4 +-
 tests/performance.py                      |   4 +-
 tests/utils.py                            |  44 +++---
 13 files changed, 267 insertions(+), 92 deletions(-)
 create mode 100644 docs/py/load.py

diff --git a/docs/py/customized.py b/docs/py/customized.py
index 8b58f9bf3..c3b53dcfa 100644
--- a/docs/py/customized.py
+++ b/docs/py/customized.py
@@ -1,6 +1,6 @@
 """shell
 pip install autokeras
-pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1
+pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
 """
 
 """
diff --git a/docs/py/export.py b/docs/py/export.py
index 8ce07695a..652b541a2 100644
--- a/docs/py/export.py
+++ b/docs/py/export.py
@@ -9,7 +9,7 @@
 
 """shell
 pip install autokeras
-pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1
+pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
 """
 
 import tensorflow as tf
diff --git a/docs/py/image_classification.py b/docs/py/image_classification.py
index cc357fb3b..8fba946fe 100644
--- a/docs/py/image_classification.py
+++ b/docs/py/image_classification.py
@@ -1,6 +1,6 @@
 """shell
 pip install autokeras
-pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1
+pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
 """
 
 """
@@ -22,6 +22,8 @@
 The second step is to run the ImageClassifier.
 It is recommended to have more trials for more complicated datasets.
 This is just a quick demo of MNIST, so we set max_trials to 1.
+For the same reason, we set epochs to 10.
+You can also leave the epochs unspecified for an adaptive number of epochs.
 """
 
 # Initialize the image classifier.
@@ -155,9 +157,7 @@
 # [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])
 
 """
-We also support using tf.data.Dataset format for the training data. In this case, the
-images would have to be 3-dimentional. The labels have to be one-hot encoded for
-multi-class classification to be wrapped into tensorflow Dataset.
+We also support using tf.data.Dataset format for the training data.
 """
 
 train_set = tf.data.Dataset.from_tensor_slices(((x_train,), (y_train,)))
diff --git a/docs/py/image_regression.py b/docs/py/image_regression.py
index 1d93b6f67..fd1ea0385 100644
--- a/docs/py/image_regression.py
+++ b/docs/py/image_regression.py
@@ -1,6 +1,6 @@
 """shell
 pip install autokeras
-pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1
+pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
 """
 
 """
@@ -29,6 +29,8 @@
 The second step is to run the ImageRegressor.
 It is recommended to have more trials for more complicated datasets.
 This is just a quick demo of MNIST, so we set max_trials to 1.
+For the same reason, we set epochs to 2.
+You can also leave the epochs unspecified for an adaptive number of epochs.
 """
 
 # Initialize the image regressor.
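Both image tutorials above note that `epochs` can be left unspecified. A minimal sketch of what that looks like (assuming the `clf`, `x_train`, and `y_train` objects from the image classification tutorial; when `epochs` is omitted, AutoKeras trains with an adaptive number of epochs via early stopping rather than a fixed count):

```python
# No `epochs` argument: AutoKeras chooses the number of epochs adaptively,
# stopping once the validation loss stops improving.
clf.fit(x_train, y_train)
```
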
diff --git a/docs/py/load.py b/docs/py/load.py
new file mode 100644
index 000000000..79734311c
--- /dev/null
+++ b/docs/py/load.py
@@ -0,0 +1,169 @@
+"""shell
+pip install autokeras
+pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
+"""
+
+"""
+If the data is too large to fit in memory all at once, we can load it from
+disk into memory batch by batch with tf.data.Dataset.
+This [function](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image_dataset_from_directory)
+can help you build such a tf.data.Dataset for image data.
+
+First, we download the data and extract the files.
+"""
+
+import os
+
+import tensorflow as tf
+
+dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
+local_file_path = tf.keras.utils.get_file(origin=dataset_url,
+                                          fname='image_data',
+                                          extract=True)
+# The file is extracted in the same directory as the downloaded file.
+local_dir_path = os.path.dirname(local_file_path)
+# After checking manually, we know the extracted data is in 'flower_photos'.
+data_dir = os.path.join(local_dir_path, 'flower_photos')
+print(data_dir)
+
+"""
+The directory should look like this. Each folder contains the images in the
+same class.
+
+```
+flower_photos/
+  daisy/
+  dandelion/
+  roses/
+  sunflowers/
+  tulips/
+```
+
+We can split the data into training and testing as we load them.
+"""
+
+batch_size = 32
+img_height = 180
+img_width = 180
+
+train_data = tf.keras.preprocessing.image_dataset_from_directory(
+    data_dir,
+    # Use 20% of the data as testing data.
+    validation_split=0.2,
+    subset="training",
+    # Set seed to ensure the same split when loading testing data.
+    seed=123,
+    image_size=(img_height, img_width),
+    batch_size=batch_size)
+
+test_data = tf.keras.preprocessing.image_dataset_from_directory(
+    data_dir,
+    validation_split=0.2,
+    subset="validation",
+    seed=123,
+    image_size=(img_height, img_width),
+    batch_size=batch_size)
+
+"""
+Then we just do one quick demo of AutoKeras to make sure the dataset works.
+"""
+
+import autokeras as ak
+
+clf = ak.ImageClassifier(overwrite=True, max_trials=1)
+clf.fit(train_data, epochs=1)
+print(clf.evaluate(test_data))
+
+"""
+You can also load text datasets in the same way.
+"""
+
+dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+
+local_file_path = tf.keras.utils.get_file(
+    fname="text_data",
+    origin=dataset_url,
+    extract=True,
+)
+# The file is extracted in the same directory as the downloaded file.
+local_dir_path = os.path.dirname(local_file_path)
+# After checking manually, we know the extracted data is in 'aclImdb'.
+data_dir = os.path.join(local_dir_path, 'aclImdb')
+# Remove the unused data folder.
+import shutil
+shutil.rmtree(os.path.join(data_dir, 'train/unsup'))
+
+"""
+For this dataset, the data is already split into train and test.
+We just load them separately.
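+We also hold out 20% of the training split for validation during the search,
+using the same validation_split/subset/seed mechanism as for the image data
+above.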
+""" +print(data_dir) +train_data = tf.keras.preprocessing.text_dataset_from_directory( + os.path.join(data_dir, 'train'), + class_names=['pos', 'neg'], + validation_split=0.2, + subset="training", + # shuffle=False, + seed=123, + batch_size=batch_size) + +val_data = tf.keras.preprocessing.text_dataset_from_directory( + os.path.join(data_dir, 'train'), + class_names=['pos', 'neg'], + validation_split=0.2, + subset="validation", + # shuffle=False, + seed=123, + batch_size=batch_size) + +test_data = tf.keras.preprocessing.text_dataset_from_directory( + os.path.join(data_dir, 'test'), + class_names=['pos', 'neg'], + shuffle=False, + batch_size=batch_size) + +for x, y in train_data: + print(x.numpy()[0]) + print(y.numpy()[0]) + # record_x = x.numpy() + # record_y = y.numpy() + break + +for x, y in train_data: + print(x.numpy()[0]) + print(y.numpy()[0]) + break + +# train_data = tf.keras.preprocessing.text_dataset_from_directory( + # os.path.join(data_dir, 'train'), + # class_names=['pos', 'neg'], + # shuffle=True, + # seed=123, + # batch_size=batch_size) + +# for x, y in train_data: + # for i, a in enumerate(x.numpy()): + # for j, b in enumerate(record_x): + # if a == b: + # print('*') + # assert record_y[j] == y.numpy()[i] + +# import numpy as np +# x_train = [] +# y_train = [] +# for x, y in train_data: + # for a in x.numpy(): + # x_train.append(a) + # for a in y.numpy(): + # y_train.append(a) + +# x_train = np.array(x_train) +# y_train = np.array(y_train) + +# train_data = train_data.shuffle(1000, seed=123, reshuffle_each_iteration=False) + + +clf = ak.TextClassifier(overwrite=True, max_trials=2) +# clf.fit(train_data, validation_data=test_data) +# clf.fit(train_data, validation_data=train_data) +clf.fit(train_data, validation_data=val_data) +# clf.fit(x_train, y_train) +# clf.fit(train_data) +print(clf.evaluate(test_data)) diff --git a/docs/py/multi.py b/docs/py/multi.py index e9b95615f..a9809405e 100644 --- a/docs/py/multi.py +++ b/docs/py/multi.py @@ -1,6 +1,6 @@ """shell pip install autokeras -pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1 +pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2 """ """ diff --git a/docs/py/structured_data_classification.py b/docs/py/structured_data_classification.py index dbdb1f3ef..8b117f640 100644 --- a/docs/py/structured_data_classification.py +++ b/docs/py/structured_data_classification.py @@ -1,6 +1,6 @@ """shell pip install autokeras -pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1 +pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2 """ """ @@ -21,6 +21,8 @@ """ The second step is to run the [StructuredDataClassifier](/structured_data_classifier). +As a quick demo, we set epochs to 10. +You can also leave the epochs unspecified for an adaptive number of epochs. """ # Initialize the structured data classifier. @@ -49,7 +51,7 @@ two-dimensional with numerical or categorical values. For the classification labels, -AutoKeras accepts both plain labels, i.e. strings or integers, and one-hot encoded +AutoKeras accepts both plain labels, i.e. strings or integers, and one-hot encoded encoded labels, i.e. vectors of 0s and 1s. The labels can be numpy.ndarray, pandas.DataFrame, or pandas.Series. @@ -70,7 +72,7 @@ print(type(y_train)) # pandas.DataFrame # You can also use numpy.ndarray for x_train and y_train. 
-x_train = x_train.to_numpy().astype(np.unicode)
+x_train = x_train.to_numpy()
 y_train = y_train.to_numpy()
 print(type(x_train))  # numpy.ndarray
 print(type(y_train))  # numpy.ndarray
@@ -92,13 +94,9 @@
 
 """
 The following code shows how to convert numpy.ndarray to tf.data.Dataset.
-Notably, the labels have to be one-hot encoded for multi-class
-classification to be wrapped into tensorflow Dataset.
-Since the Titanic dataset is binary
-classification, it should not be one-hot encoded.
 """
 
-train_set = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+train_set = tf.data.Dataset.from_tensor_slices((x_train.astype(np.unicode), y_train))
 test_set = tf.data.Dataset.from_tensor_slices((x_test.to_numpy().astype(np.unicode), y_test))
 
 clf = ak.StructuredDataClassifier(
diff --git a/docs/py/structured_data_regression.py b/docs/py/structured_data_regression.py
index d7abdb39a..f256ed78a 100644
--- a/docs/py/structured_data_regression.py
+++ b/docs/py/structured_data_regression.py
@@ -1,6 +1,6 @@
 """shell
 pip install autokeras
-pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1
+pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
 """
 
 """
@@ -31,6 +31,8 @@
 """
 The second step is to run the
 [StructuredDataRegressor](/structured_data_regressor).
+As a quick demo, we set epochs to 10.
+You can also leave the epochs unspecified for an adaptive number of epochs.
 """
 
 # Initialize the structured data regressor.
diff --git a/docs/py/text_classification.py b/docs/py/text_classification.py
index c7e647c56..509a80359 100644
--- a/docs/py/text_classification.py
+++ b/docs/py/text_classification.py
@@ -1,6 +1,6 @@
 """shell
 pip install autokeras
-pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1
+pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
 """
 
 """
@@ -10,35 +10,38 @@
 an example.
 """
 
+import os
 import numpy as np
+import tensorflow as tf
 from tensorflow.keras.datasets import imdb
+from sklearn.datasets import load_files
+
+dataset = tf.keras.utils.get_file(
+    fname="aclImdb.tar.gz",
+    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
+    extract=True,
+)
+
+# set path to dataset
+IMDB_DATADIR = os.path.join(os.path.dirname(dataset), 'aclImdb')
+
+classes = ['pos', 'neg']
+train_data = load_files(os.path.join(IMDB_DATADIR, 'train'), shuffle=True, categories=classes)
+test_data = load_files(os.path.join(IMDB_DATADIR, 'test'), shuffle=False, categories=classes)
+
+x_train = np.array(train_data.data)
+y_train = np.array(train_data.target)
+x_test = np.array(test_data.data)
+y_test = np.array(test_data.target)
 
-# Load the integer sequence the IMDB dataset with Keras.
-index_offset = 3  # word index offset
-(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000,
-                                                      index_from=index_offset)
-y_train = y_train.reshape(-1, 1)
-y_test = y_test.reshape(-1, 1)
-# Prepare the dictionary of index to word.
-word_to_id = imdb.get_word_index()
-word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
-word_to_id["<PAD>"] = 0
-word_to_id["<START>"] = 1
-word_to_id["<UNK>"] = 2
-id_to_word = {value: key for key, value in word_to_id.items()}
-# Convert the word indices to words.
-x_train = list(map(lambda sentence: ' '.join(
-    id_to_word[i] for i in sentence), x_train))
-x_test = list(map(lambda sentence: ' '.join(
-    id_to_word[i] for i in sentence), x_test))
-x_train = np.array(x_train, dtype=np.str)
-x_test = np.array(x_test, dtype=np.str)
 
 print(x_train.shape)  # (25000,)
 print(y_train.shape)  # (25000,)
 print(x_train[0][:50])  # this film was just brilliant casting
 
 """
 The second step is to run the [TextClassifier](/text_classifier).
+As a quick demo, we set epochs to 2.
+You can also leave the epochs unspecified for an adaptive number of epochs.
 """
 
 import autokeras as ak
 
 # Initialize the text classifier.
 clf = ak.TextClassifier(
     overwrite=True,
-    max_trials=1)  # It tries 10 different models.
+    max_trials=1)  # It only tries 1 model as a quick demo.
 # Feed the text classifier with training data.
 clf.fit(x_train, y_train, epochs=2)
 # Predict with the best model.
@@ -146,18 +149,14 @@
 We also support using [tf.data.Dataset](
 https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable)
 format for the training data.
-The labels have to be one-hot encoded for multi-class
-classification to be wrapped into tensorflow Dataset.
-Since the IMDB dataset is binary classification, it should not be one-hot encoded.
 """
 
-import tensorflow as tf
 train_set = tf.data.Dataset.from_tensor_slices(((x_train, ), (y_train, ))).batch(32)
 test_set = tf.data.Dataset.from_tensor_slices(((x_test, ), (y_test, ))).batch(32)
 
 clf = ak.TextClassifier(
     overwrite=True,
-    max_trials=3)
+    max_trials=2)
 # Feed the tensorflow Dataset to the classifier.
 clf.fit(train_set, epochs=2)
 # Predict with the best model.
diff --git a/docs/py/text_regression.py b/docs/py/text_regression.py
index 809755cbb..f3e96e9c6 100644
--- a/docs/py/text_regression.py
+++ b/docs/py/text_regression.py
@@ -1,6 +1,6 @@
 """shell
 pip install autokeras
-pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1
+pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
 """
 
 """
@@ -14,35 +14,38 @@
 an example.
 """
 
+import os
 import numpy as np
+import tensorflow as tf
 from tensorflow.keras.datasets import imdb
+from sklearn.datasets import load_files
+
+dataset = tf.keras.utils.get_file(
+    fname="aclImdb.tar.gz",
+    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
+    extract=True,
+)
+
+# set path to dataset
+IMDB_DATADIR = os.path.join(os.path.dirname(dataset), 'aclImdb')
+
+classes = ['pos', 'neg']
+train_data = load_files(os.path.join(IMDB_DATADIR, 'train'), shuffle=True, categories=classes)
+test_data = load_files(os.path.join(IMDB_DATADIR, 'test'), shuffle=False, categories=classes)
+
+x_train = np.array(train_data.data)
+y_train = np.array(train_data.target)
+x_test = np.array(test_data.data)
+y_test = np.array(test_data.target)
 
-# Load the integer sequence the IMDB dataset with Keras.
-index_offset = 3  # word index offset
-(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000,
                                                      index_from=index_offset)
-y_train = y_train.reshape(-1, 1)
-y_test = y_test.reshape(-1, 1)
-# Prepare the dictionary of index to word.
-word_to_id = imdb.get_word_index()
-word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
-word_to_id["<PAD>"] = 0
-word_to_id["<START>"] = 1
-word_to_id["<UNK>"] = 2
-id_to_word = {value: key for key, value in word_to_id.items()}
-# Convert the word indices to words.
-x_train = list(map(lambda sentence: ' '.join(
-    id_to_word[i] for i in sentence), x_train))
-x_test = list(map(lambda sentence: ' '.join(
-    id_to_word[i] for i in sentence), x_test))
-x_train = np.array(x_train, dtype=np.str)
-x_test = np.array(x_test, dtype=np.str)
 
 print(x_train.shape)  # (25000,)
 print(y_train.shape)  # (25000,)
 print(x_train[0][:50])  # this film was just brilliant casting
 
 """
 The second step is to run the [TextRegressor](/text_regressor).
+As a quick demo, we set epochs to 2.
+You can also leave the epochs unspecified for an adaptive number of epochs.
 """
 
 import autokeras as ak
@@ -158,7 +161,7 @@
 
 reg = ak.TextRegressor(
     overwrite=True,
-    max_trials=3)
+    max_trials=2)
 # Feed the tensorflow Dataset to the regressor.
 reg.fit(train_set, epochs=2)
 # Predict with the best model.
diff --git a/docs/templates/install.md b/docs/templates/install.md
index 192769ee4..e6cb357c5 100644
--- a/docs/templates/install.md
+++ b/docs/templates/install.md
@@ -17,14 +17,14 @@ AutoKeras only supports **Python 3**.
 If you followed previous steps to use virtualenv to install tensorflow,
 you can just activate the virtualenv and use the following command to install AutoKeras.
 
 ```
-pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1
+pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
 pip install autokeras
 ```
 
 If you did not use virtualenv, and you use `python3` command to execute your python program,
 please use the following command to install AutoKeras.
 
 ```
-python3 -m pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc1
+python3 -m pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc2
 python3 -m pip install autokeras
 ```
diff --git a/tests/performance.py b/tests/performance.py
index 010e956fa..ee4f2dfec 100644
--- a/tests/performance.py
+++ b/tests/performance.py
@@ -36,10 +36,10 @@ def test_cifar10_accuracy_over_93(tmp_path):
     assert accuracy >= 0.93
 
 
-def test_imdb_accuracy_over_84(tmp_path):
+def test_imdb_accuracy_over_92(tmp_path):
     (x_train, y_train), (x_test, y_test) = utils.imdb_raw(num_instances=None)
     clf = ak.TextClassifier(max_trials=3, directory=tmp_path)
-    clf.fit(x_train, y_train, epochs=2)
+    clf.fit(x_train, y_train, batch_size=6, epochs=1)
     accuracy = clf.evaluate(x_test, y_test)[1]
     assert accuracy >= 0.92
diff --git a/tests/utils.py b/tests/utils.py
index 564bc5c50..92eb6aa9e 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -17,6 +17,7 @@
 
 import numpy as np
 import tensorflow as tf
+from sklearn.datasets import load_files
 
 import autokeras as ak
 
@@ -76,32 +77,33 @@ def generate_one_hot_labels(num_instances=100, num_classes=10, dtype="np"):
 
 
 def imdb_raw(num_instances=100):
-    index_offset = 3  # word index offset
-
-    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
-        num_words=1000, index_from=index_offset
+    dataset = tf.keras.utils.get_file(
+        fname="aclImdb.tar.gz",
+        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
+        extract=True,
     )
-    if num_instances is not None:
-        x_train = x_train[:num_instances]
-        y_train = y_train[:num_instances].reshape(-1, 1)
-        x_test = x_test[:num_instances]
-        y_test = y_test[:num_instances].reshape(-1, 1)
 
-    word_to_id = tf.keras.datasets.imdb.get_word_index()
-    word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
-    word_to_id["<PAD>"] = 0
-    word_to_id["<START>"] = 1
-    word_to_id["<UNK>"] = 2
+    # set path to dataset
+    IMDB_DATADIR = os.path.join(os.path.dirname(dataset), "aclImdb")
 
-    id_to_word = {value: key for key, value in word_to_id.items()}
-    x_train = list(
-        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_train)
+    classes = ["pos", "neg"]
+    train_data = load_files(
+        os.path.join(IMDB_DATADIR, "train"), shuffle=True, categories=classes
     )
-    x_test = list(
-        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_test)
+    test_data = load_files(
+        os.path.join(IMDB_DATADIR, "test"), shuffle=False, categories=classes
     )
-    x_train = np.array(x_train, dtype=np.str)
-    x_test = np.array(x_test, dtype=np.str)
+
+    x_train = np.array(train_data.data)
+    y_train = np.array(train_data.target)
+    x_test = np.array(test_data.data)
+    y_test = np.array(test_data.target)
+
+    if num_instances is not None:
+        x_train = x_train[:num_instances]
+        y_train = y_train[:num_instances]
+        x_test = x_test[:num_instances]
+        y_test = y_test[:num_instances]
 
     return (x_train, y_train), (x_test, y_test)
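For reference, a minimal sketch of how the rewritten `imdb_raw` helper is consumed (a hypothetical snippet mirroring the call in `tests/performance.py` above; with `load_files`, the `x_*` arrays hold raw review byte strings and the `y_*` arrays hold integer labels):

```python
from tests import utils

# Keep only the first 100 instances per split for a fast smoke test.
(x_train, y_train), (x_test, y_test) = utils.imdb_raw(num_instances=100)
assert x_train.shape == (100,)
assert y_train.shape == (100,)
```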