From 07f6bda31f4e223e80e7756cba424ab907e6e29e Mon Sep 17 00:00:00 2001 From: Stefan van der Walt Date: Tue, 6 Oct 2015 18:39:46 -0700 Subject: [PATCH 01/16] Add preliminary version of tutorial --- docs/conf.py | 2 ++ docs/tutorial.rst | 52 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 34943d339..2bec3ace8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,6 +35,7 @@ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', + 'IPython.sphinxext.ipython_directive' ] # Config autosummary @@ -147,6 +148,7 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = [] +ipython_savefig_dir = '../_build/html/_static' # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 7aaed0f49..25ad56ad9 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1,16 +1,66 @@ Start Here: ``datascience`` Tutorial ==================================== -In progress. +This is general introduction to the functionality in +:py:mod:`datascience`. For a reference guide, please see +:ref:`tables-overview`. Introduction ------------ +First, load the :py:mod:`datascience` module: + +.. ipython:: python + + import datascience as ds + +In the IPython notebook, type `ds.` followed by the TAB-key to see a list of members. +The most important of these is the :py:class:`Table` class, which is +the structure used to represent columns of data. + Basic Table Usage ----------------- +A table is constructed as follows: + +.. 
ipython:: python + + letters = ['a', 'b', 'c', 'z'] + counts = [9, 3, 3, 1] + points = [1, 2, 2, 10] + + t = ds.Table(columns=[letters, counts, points], + labels=['letter', 'count', 'points']) + + print(t) + +Note how the first keyword, ``columns``, specifies the contents of the +table, and how the second, ``labels``, gives a name to each column. + +A table could also be read from a CSV file (that can be exported from +an Excel spreadsheet, for example). Here's the content of the file: + +.. ipython:: python + + cat mydata.csv + +And this is how we load it in as a :class:`Table`: + +.. ipython:: python + + t = ds.Table.read_table('mydata.csv') + print(t) + More Advanced Table Usage ------------------------- +Once a table has been constructed, we can do various queries on it. + +Print the first two entries: + +.. ipython:: python + + print(t[:2]) + Drawing Maps ------------ From 3013c702c2e1fc5d1b6507b1ecbde5cf27cbb68c Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Mon, 21 Dec 2015 03:14:16 -0800 Subject: [PATCH 02/16] Write first few sections of the tutorial The rest is to come. --- docs/sample.csv | 4 ++ docs/tutorial.rst | 115 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 94 insertions(+), 25 deletions(-) create mode 100644 docs/sample.csv diff --git a/docs/sample.csv b/docs/sample.csv new file mode 100644 index 000000000..ecee95a65 --- /dev/null +++ b/docs/sample.csv @@ -0,0 +1,4 @@ +x,y,z +1,10,100 +2,11,101 +3,12,102 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 25ad56ad9..82b4ce969 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1,27 +1,57 @@ Start Here: ``datascience`` Tutorial ==================================== -This is general introduction to the functionality in -:py:mod:`datascience`. For a reference guide, please see +This is a brief introduction to the functionality in +:py:mod:`datascience`. For a complete reference guide, please see :ref:`tables-overview`. 
-Introduction ------------- +For other useful tutorials and examples, see: + +- `The textbook introduction to Tables`_ +- `Example notebooks`_ + +.. _The textbook introduction to Tables: http://data8.org/text/1_data.html#tables +.. _Example notebooks: https://github.com/deculler/TableDemos -First, load the :py:mod:`datascience` module: +Getting Started +--------------- + +The most important functionality in the package is the :py:class:`Table` +class, which is the structure used to represent columns of data. You may load +the class with: .. ipython:: python - import datascience as ds + from datascience import Table -In the IPython notebook, type `ds.` followed by the TAB-key to see a list of members. -The most important of these is the :py:class:`Table` class, which is -the structure used to represent columns of data. +In the IPython notebook, type ``Table.`` followed by the TAB-key to see a list of +members. -Basic Table Usage ------------------ +Note that for the Data Science 8 class we also import additional packages and +settings for all assignments and labs. This is so that plots and other available +packages mirror the ones in the textbook more closely. The exact code we use is: + +.. code-block:: python + + # HIDDEN + + import matplotlib + matplotlib.use('Agg') + from datascience import Table + %matplotlib inline + import matplotlib.pyplot as plots + import numpy as np + plots.style.use('fivethirtyeight') + +In particular, the lines involving ``matplotlib`` allow for plotting within the +IPython notebook. + +Creating a Table +---------------- + +A Table is a sequence of labeled columns of data. -A table is constructed as follows: +The basic Table constructor works as follows: .. 
ipython:: python @@ -29,38 +59,73 @@ A table is constructed as follows: counts = [9, 3, 3, 1] points = [1, 2, 2, 10] - t = ds.Table(columns=[letters, counts, points], - labels=['letter', 'count', 'points']) + t = Table(columns=[letters, counts, points], + labels=['letter', 'count', 'points']) print(t) -Note how the first keyword, ``columns``, specifies the contents of the -table, and how the second, ``labels``, gives a name to each column. +Note how the first keyword, ``columns``, specifies the contents of the table, +and how the second, ``labels``, gives a name to each column. See +:meth:`~datascience.tables.Table.__init__` for more details. -A table could also be read from a CSV file (that can be exported from -an Excel spreadsheet, for example). Here's the content of the file: +------ + +A table could also be read from a CSV file (that can be exported from an Excel +spreadsheet, for example). Here's the content of an example file: .. ipython:: python cat mydata.csv -And this is how we load it in as a :class:`Table`: +And this is how we load it in as a :class:`Table` using +:meth:`~datascience.tables.Table.read_table`: .. ipython:: python - t = ds.Table.read_table('mydata.csv') + t = Table.read_table('sample.csv') print(t) -More Advanced Table Usage -------------------------- +CSVs from URLs are also valid inputs to +:meth:`~datascience.tables.Table.read_table`: + +.. ipython:: python -Once a table has been constructed, we can do various queries on it. + Table.read_table('http://data8.org/text/sat2014.csv') -Print the first two entries: +------ + +For convenience, you can also initialize a Table from a dictionary of column +names using +:meth:`~datascience.tables.Table.from_columns_dict`. .. ipython:: python - print(t[:2]) + Table.from_columns_dict({ + 'letter': letters, + 'count': counts, + 'points': points, + }) + +This example illustrates the fact that built-in Python dictionaries don't +preserve their key order. 
If you want to ensure the order of your columns, use +an ``OrderedDict``. + +Accessing Values +---------------- +To come. + +Manipulating Data +----------------- +To come. + +Visualizing Data +---------------- +To come. + +An Example +---------- +To come. Drawing Maps ------------ +To come. From 440c377642102fddc86cec153b5026f78fe03edc Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Tue, 22 Dec 2015 13:10:45 -0800 Subject: [PATCH 03/16] Add highlighting for IPython blocks --- docs/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 2bec3ace8..fba253690 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,6 +35,9 @@ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', + # These IPython extensions allow for embedded IPython code that gets rerun + # at build time. + 'IPython.sphinxext.ipython_console_highlighting', 'IPython.sphinxext.ipython_directive' ] From 86ace8854cb9800980a01d2271b50043f0929763 Mon Sep 17 00:00:00 2001 From: Chris Holdgraf Date: Tue, 22 Dec 2015 15:30:43 -0600 Subject: [PATCH 04/16] Added a check for python 3 --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 644dadf47..307f4953a 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,8 @@ from setuptools.command.test import test as TestCommand +if sys.version_info < (3, 0): + raise ValueError('This package requires python >= 3.0') with open('requirements.txt') as fid: install_requires = [l.strip() for l in fid.readlines() if l] From d0113418b5ac0db91d5bac3663ca46cbcf7096ce Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Tue, 22 Dec 2015 16:37:01 -0800 Subject: [PATCH 05/16] Write accessing values section --- docs/tutorial.rst | 77 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 82b4ce969..09940c672 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -24,8 +24,8 @@ the class with: from datascience 
import Table -In the IPython notebook, type ``Table.`` followed by the TAB-key to see a list of -members. +In the IPython notebook, type ``Table.`` followed by the TAB-key to see a list +of members. Note that for the Data Science 8 class we also import additional packages and settings for all assignments and labs. This is so that plots and other available @@ -55,14 +55,14 @@ The basic Table constructor works as follows: .. ipython:: python - letters = ['a', 'b', 'c', 'z'] - counts = [9, 3, 3, 1] - points = [1, 2, 2, 10] + letters = ['a', 'b', 'c', 'z'] + counts = [9, 3, 3, 1] + points = [1, 2, 2, 10] - t = Table(columns=[letters, counts, points], - labels=['letter', 'count', 'points']) + t = Table(columns=[letters, counts, points], + labels=['letter', 'count', 'points']) - print(t) + print(t) Note how the first keyword, ``columns``, specifies the contents of the table, and how the second, ``labels``, gives a name to each column. See @@ -75,22 +75,22 @@ spreadsheet, for example). Here's the content of an example file: .. ipython:: python - cat mydata.csv + cat mydata.csv And this is how we load it in as a :class:`Table` using :meth:`~datascience.tables.Table.read_table`: .. ipython:: python - t = Table.read_table('sample.csv') - print(t) + t = Table.read_table('sample.csv') + print(t) CSVs from URLs are also valid inputs to :meth:`~datascience.tables.Table.read_table`: .. ipython:: python - Table.read_table('http://data8.org/text/sat2014.csv') + Table.read_table('http://data8.org/text/sat2014.csv') ------ @@ -100,23 +100,58 @@ names using .. ipython:: python - Table.from_columns_dict({ - 'letter': letters, - 'count': counts, - 'points': points, - }) + Table.from_columns_dict({ + 'letter': letters, + 'count': counts, + 'points': points, + }) This example illustrates the fact that built-in Python dictionaries don't -preserve their key order. If you want to ensure the order of your columns, use -an ``OrderedDict``. 
+preserve their key order -- the dictionary keys are ordered 'letter', 'count', +then 'points', but the table columns are ordered 'points', 'count', then +'letter'). If you want to ensure the order of your columns, use an +``OrderedDict``. Accessing Values ---------------- -To come. + +To access values of columns in the table, use +:meth:`~datascience.tables.Table.values`. + +.. ipython:: python + + t + + t.values('x') + t.values('y') + + t['x'] # This is a shorthand for t.values('x') + +To access values by row, :meth:`~datascience.tables.Table.rows` returns an +list-like :class:`~datascience.tables.Table.Rows` object that contains +tuple-like :class:`~datascience.tables.Table.Row` objects. + +.. ipython:: python + + t.rows + t.rows[0] + + second = t.rows[1] + second + second[0] + second[1] + +To get the number of rows, use :meth:`~datascience.tables.Table.num_rows`. + +.. ipython:: python + + t.num_rows + Manipulating Data ----------------- -To come. + + Visualizing Data ---------------- From ba3fbf15660cf9a58881f84513b2f36b2019f1e4 Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Wed, 23 Dec 2015 16:47:23 -0800 Subject: [PATCH 06/16] Fix error in figure path for ipython plots --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index fba253690..5bc561ff8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -151,7 +151,7 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = [] -ipython_savefig_dir = '../_build/html/_static' +ipython_savefig_dir = './_build/html/_images' # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. 
These files are copied From 8368b1d9d6b588dcbe1dd6a5fcb83675f3da01bf Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Wed, 23 Dec 2015 16:47:59 -0800 Subject: [PATCH 07/16] Write rest of tutorial Except for the example --- .gitignore | 1 + docs/tutorial.rst | 162 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 152 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index cc5c9d120..5747a1fc0 100644 --- a/.gitignore +++ b/.gitignore @@ -61,3 +61,4 @@ cache/ docs/_build/ docs/_autosummary/ +docs/normal_data.csv diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 09940c672..3585392f5 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -82,8 +82,7 @@ And this is how we load it in as a :class:`Table` using .. ipython:: python - t = Table.read_table('sample.csv') - print(t) + Table.read_table('sample.csv') CSVs from URLs are also valid inputs to :meth:`~datascience.tables.Table.read_table`: @@ -107,10 +106,10 @@ names using }) This example illustrates the fact that built-in Python dictionaries don't -preserve their key order -- the dictionary keys are ordered 'letter', 'count', -then 'points', but the table columns are ordered 'points', 'count', then -'letter'). If you want to ensure the order of your columns, use an -``OrderedDict``. +preserve their key order -- the dictionary keys are ordered ``'letter'``, +``'count'``, then ``'points'``, but the table columns are ordered ``'points'``, +``'count'``, then ``'letter'``. If you want to ensure the order of your +columns, use an ``OrderedDict``. 
Accessing Values ---------------- @@ -122,10 +121,10 @@ To access values of columns in the table, use t - t.values('x') - t.values('y') + t.values('letter') + t.values('count') - t['x'] # This is a shorthand for t.values('x') + t['letter'] # This is a shorthand for t.values('letter') To access values by row, :meth:`~datascience.tables.Table.rows` returns an list-like :class:`~datascience.tables.Table.Rows` object that contains @@ -141,7 +140,7 @@ tuple-like :class:`~datascience.tables.Table.Row` objects. second[0] second[1] -To get the number of rows, use :meth:`~datascience.tables.Table.num_rows`. +To get the number of rows, use :attr:`~datascience.tables.Table.num_rows`. .. ipython:: python @@ -151,11 +150,152 @@ To get the number of rows, use :meth:`~datascience.tables.Table.num_rows`. Manipulating Data ----------------- +Here are some of the most common operations on data. For the rest, see the +reference (:ref:`tables-overview`). + +Adding a column with :meth:`~datascience.tables.Table.with_column`: + +.. ipython:: python + + t + t.with_column('vowel?', ['yes', 'no', 'no', 'no']) + t # .with_column returns a new table without modifying the original + + t.with_column('2 * count', t['count'] * 2) # A simple way to operate on columns + +Selecting columns with :meth:`~datascience.tables.Table.select`: + +.. ipython:: python + + t.select('letter') + t.select(['letter', 'points']) + +Renaming columns with :meth:`~datascience.tables.Table.with_relabeling`: + +.. ipython:: python + + t + t.with_relabeling('points', 'other name') + t + t.with_relabeling(['letter', 'count', 'points'], ['x', 'y', 'z']) + +Selecting out rows by index with :meth:`~datascience.tables.Table.take` and +conditionally with :meth:`~datascience.tables.Table.where`: + +.. ipython:: python + + t + t.take(2) # the third row + t.take[0:2] # the first and second rows + +.. 
ipython:: python + + t.where('points', 2) # rows where points == 2 + t.where(t['count'] < 8) # rows where count < 8 + + t['count'] < 8 # .where actually takes in an array of booleans + t.where([False, True, True, True]) # same as the last line + +Operate on table data with :meth:`~datascience.tables.Table.sort`, +:meth:`~datascience.tables.Table.group`, and +:meth:`~datascience.tables.Table.pivot` + +.. ipython:: python + + t + t.sort('count') + t.sort('letter', descending = True) + +.. ipython:: python + + t.group('count') + + # You may pass a reducing function into the collect arg + # Note the renaming of the points column because of the collect arg + t.select(['count', 'points']).group('count', collect = sum) + +.. ipython:: python + + other_table = Table([ + ['married', 'married', 'partner', 'partner', 'married'], + ['Working as paid', 'Working as paid', 'Not working', 'Not working', 'Not working'], + [1, 1, 1, 1, 1] + ], + ['mar_status', 'empl_status', 'count']) + other_table + other_table.pivot('mar_status', 'empl_status', 'count', collect = sum) Visualizing Data ---------------- -To come. + +We'll start with some data drawn at random from two normal distributions: + +.. ipython:: python + + normal_data = Table( + [ np.random.normal(loc = 1, scale = 2, size = 100), + np.random.normal(loc = 4, scale = 3, size = 100) ], + ['data1', 'data2'] + }) + + normal_data + +Draw histograms with :meth:`~datascience.tables.Table.hist`: + +.. ipython:: python + + @savefig hist.png width=4in + normal_data.hist() + +.. ipython:: python + + @savefig hist_binned.png width=4in + normal_data.hist(bins = range(-5, 10)) + +.. ipython:: python + + @savefig hist_overlay.png width=4in + normal_data.hist(bins = range(-5, 10), overlay = True) + +If we treat the ``normal_data`` table as a set of x-y points, we can +:meth:`~datascience.tables.Table.plot` and +:meth:`~datascience.tables.Table.scatter`: + +.. 
ipython:: python + + @savefig plot.png width=4in + normal_data.sort('data1').plot('data1') # Sort first to make plot nicer + +.. ipython:: python + + @savefig scatter.png width=4in + normal_data.scatter('data1') + +.. ipython:: python + + @savefig scatter_line.png width=4in + normal_data.scatter('data1', fit_line = True) + +Use :meth:`~datascience.tables.Table.barh` to display categorical data. + +.. ipython:: python + + t + t.barh('letter') + +Exporting +--------- + +Exporting to CSV is the most common operation and can be done by first +converting to a pandas dataframe with :meth:`~datascience.tables.Table.to_df`: + +.. ipython:: python + + normal_data + + # index = False prevents row numbers from appearing in the resulting CSV + normal_data.to_df().to_csv('normal_data.csv', index = False) An Example ---------- From 9a675087c7512a070349f5ada811a97893057150 Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Wed, 23 Dec 2015 17:29:31 -0800 Subject: [PATCH 08/16] Fix typo in tutorial and silence matplotlib warning --- docs/conf.py | 11 +++++++++++ docs/tutorial.rst | 28 ++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 5bc561ff8..1e5fceac7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,6 +41,17 @@ 'IPython.sphinxext.ipython_directive' ] +# The following lines silence the matplotlib.use warnings since we import +# matplotlib in each ipython directive block +ipython_mplbackend = None +ipython_execlines = [ + 'import matplotlib', + 'matplotlib.use("Agg", warn=False)', + 'import numpy as np', + 'import matplotlib.pyplot as plt', + 'plt.style.use("fivethirtyeight")', +] + # Config autosummary autosummary_generate = True diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 3585392f5..7f7db8dd7 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -237,7 +237,7 @@ We'll start with some data drawn at random from two normal distributions: [ np.random.normal(loc = 1, scale = 2, size = 100), 
np.random.normal(loc = 4, scale = 3, size = 100) ], ['data1', 'data2'] - }) + ) normal_data @@ -299,7 +299,31 @@ converting to a pandas dataframe with :meth:`~datascience.tables.Table.to_df`: An Example ---------- -To come. + +Because most methods return a new Table, we can chain the above methods to +work with data easily. + +We'll recreate the steps in `Chapter 3 of the textbook`_ to see if there is a +significant difference in birth weights between smokers and non-smokers using a +bootstrap test. + +.. _Chapter 3 of the textbook: http://data8.org/text/3_inference.html#Using-the-Bootstrap-Method-to-Test-Hypotheses + +From the text: + + The table ``baby`` contains data on a random sample of 1,174 mothers and + their newborn babies. The column ``birthwt`` contains the birth weight of + the baby, in ounces; ``gest_days`` is the number of gestational days, that + is, the number of days the baby was in the womb. There is also data on + maternal age, maternal height, maternal pregnancy weight, and whether or not + the mother was a smoker. + +.. ipython:: python + + baby = Table.read_table('http://data8.org/text/baby.csv') + baby # Let's take a peek at the table + + Drawing Maps ------------ From ae4dd7e6a266bddae3ca997105602d0989f97289 Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Wed, 23 Dec 2015 18:52:58 -0800 Subject: [PATCH 09/16] Add example and finish tutorial --- docs/tutorial.rst | 68 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 7f7db8dd7..a3ac92457 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -282,6 +282,7 @@ Use :meth:`~datascience.tables.Table.barh` to display categorical data. .. 
ipython:: python t + @savefig barh.png width=4in t.barh('letter') Exporting @@ -300,14 +301,14 @@ converting to a pandas dataframe with :meth:`~datascience.tables.Table.to_df`: An Example ---------- -Because most methods return a new Table, we can chain the above methods to -work with data easily. - We'll recreate the steps in `Chapter 3 of the textbook`_ to see if there is a significant difference in birth weights between smokers and non-smokers using a bootstrap test. +For more examples, check out `the TableDemos repo`_. + .. _Chapter 3 of the textbook: http://data8.org/text/3_inference.html#Using-the-Bootstrap-Method-to-Test-Hypotheses +.. _the TableDemos repo: https://github.com/deculler/TableDemos From the text: @@ -323,6 +324,67 @@ From the text: baby = Table.read_table('http://data8.org/text/baby.csv') baby # Let's take a peek at the table + # Select out columns we want. + smoker_and_wt = baby.select(['m_smoker', 'birthwt']) + smoker_and_wt + +Let's compare the number of smokers to non-smokers. + +.. ipython:: python + + @savefig m_smoker.png width=4in + smoker_and_wt.select('m_smoker').hist(bins = [0, 1, 2]); + +We can also compare the distribution of birthweights between smokers and +non-smokers. + +.. ipython:: python + + # Non smokers + # We do this by grabbing the rows that correspond to mothers that don't + # smoke, then plotting a histogram of just the birthweights. + @savefig not_m_smoker_weights.png width=4in + smoker_and_wt.where('m_smoker', 0).select('birthwt').hist() + + # Smokers + @savefig m_smoker_weights.png width=4in + smoker_and_wt.where('m_smoker', 1).select('birthwt').hist() + +What's the difference in mean birth weight of the two categories? + +.. 
ipython:: python + + nonsmoking_mean = smoker_and_wt.where('m_smoker', 0).values('birthwt').mean() + smoking_mean = smoker_and_wt.where('m_smoker', 1).values('birthwt').mean() + + observed_diff = nonsmoking_mean - smoking_mean + observed_diff + +Let's do the bootstrap test on the two categories. + +.. ipython:: python + + num_nonsmokers = smoker_and_wt.where('m_smoker', 0).num_rows + def bootstrap_once(): + """ + Computes one bootstrapped difference in means. + The table.sample method lets us take random samples. + We then split according to the number of nonsmokers in the original sample. + """ + resample = smoker_and_wt.sample(with_replacement = True) + bootstrap_diff = resample.values('birthwt')[:num_nonsmokers].mean() - \ + resample.values('birthwt')[num_nonsmokers:].mean() + return bootstrap_diff + + repetitions = 1000 + bootstrapped_diff_means = np.array( + [ bootstrap_once() for _ in range(repetitions) ]) + + bootstrapped_diff_means[:10] + + num_diffs_greater = (abs(bootstrapped_diff_means) > abs(observed_diff)).sum() + p_value = num_diffs_greater / len(bootstrapped_diff_means) + p_value Drawing Maps From 994d8a0ae63739b19dc134f9b066b235203a617c Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Thu, 24 Dec 2015 00:00:30 -0800 Subject: [PATCH 10/16] Use plt instead of plots --- docs/tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index a3ac92457..1328bae9e 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -39,9 +39,9 @@ packages mirror the ones in the textbook more closely. The exact code we use is: matplotlib.use('Agg') from datascience import Table %matplotlib inline - import matplotlib.pyplot as plots + import matplotlib.pyplot as plt import numpy as np - plots.style.use('fivethirtyeight') + plt.style.use('fivethirtyeight') In particular, the lines involving ``matplotlib`` allow for plotting within the IPython notebook. 
From 30eda1064631d5c65a24a16f4b445bcc9bc38584 Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Sun, 20 Dec 2015 21:57:52 -0800 Subject: [PATCH 11/16] Add methods we forgot to document before to docs Table.exclude and Table.boxplot are available but were not put into the docs. This commit fixes that. I also reordered methods to better match the ordering in the docs. --- datascience/tables.py | 328 +++++++++++++++++++++--------------------- docs/tables.rst | 2 + 2 files changed, 166 insertions(+), 164 deletions(-) diff --git a/datascience/tables.py b/datascience/tables.py index 3ed3dcf9e..50fb1d5cb 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -246,15 +246,6 @@ def __init__(self, columns=None, labels=None, self.take = _RowTaker(self) self.exclude = _RowExcluder(self) - # These, along with a snippet below, are necessary for Sphinx to - # correctly load the `take` and `exclude` docstrings. The definitions - # will be over-ridden during class instantiation. - def take(self): - raise NotImplementedError() - - def exclude(self): - raise NotImplementedError() - @classmethod def empty(cls, column_labels=None): """Create an empty table. Column labels are optional @@ -345,6 +336,18 @@ def _add_column_and_format(self, table, label, column): if label in self._formats: table._formats[label] = self._formats[label] + @classmethod + def from_df(cls, df): + """Convert a Pandas DataFrame into a Table.""" + labels = df.columns + return Table([df[label].values for label in labels], labels) + + @classmethod + def from_array(cls, arr): + """Convert a structured NumPy array into a Table.""" + return Table([arr[f] for f in arr.dtype.names], + labels=arr.dtype.names) + ################# # Magic Methods # @@ -711,6 +714,15 @@ def select(self, column_label_or_labels): self._add_column_and_format(table, label, np.copy(self[label])) return table + # These, along with a snippet below, are necessary for Sphinx to + # correctly load the `take` and `exclude` docstrings. 
The definitions + # will be over-ridden during class instantiation. + def take(self): + raise NotImplementedError() + + def exclude(self): + raise NotImplementedError() + def drop(self, column_label_or_labels): """Return a Table with only columns other than selected label or labels.""" exclude = _as_labels(column_label_or_labels) @@ -1298,18 +1310,6 @@ def index_by(self, column_or_label): index.setdefault(key, []).append(row) return index - @classmethod - def from_df(cls, df): - """Convert a Pandas DataFrame into a Table.""" - labels = df.columns - return Table([df[label].values for label in labels], labels) - - @classmethod - def from_array(cls, arr): - """Convert a structured NumPy array into a Table.""" - return Table([arr[f] for f in arr.dtype.names], - labels=arr.dtype.names) - def to_df(self): """Convert the table to a Pandas DataFrame.""" return pandas.DataFrame(self._columns) @@ -1348,6 +1348,28 @@ def to_array(self): 'alpha': 0.8, } + def _visualize(self, x_label, y_labels, ticks, overlay, draw, annotate, width=6, height=4): + """Generic visualization that overlays or separates the draw function.""" + n = len(y_labels) + colors = list(itertools.islice(itertools.cycle(self.chart_colors), n)) + if overlay: + _, axis = plt.subplots(figsize=(width, height)) + for label, color in zip(y_labels, colors): + draw(axis, label, color) + if ticks is not None: + annotate(axis, ticks) + axis.legend(y_labels, bbox_to_anchor=(1.5, 1.0)) + else: + fig, axes = plt.subplots(n, 1, figsize=(width, height * n)) + if not isinstance(axes, collections.Iterable): + axes=[axes] + for axis, y_label, color in zip(axes, y_labels, colors): + draw(axis, y_label, color) + axis.set_ylabel(y_label, fontsize=16) + axis.set_xlabel(x_label, fontsize=16) + if ticks is not None: + annotate(axis, ticks) + def plot(self, column_for_xticks, overlay=False, **vargs): """Plot contents as lines.""" options = self.default_options.copy() @@ -1366,84 +1388,70 @@ def annotate(axis, ticks): 
self._visualize(column_for_xticks, y_labels, xticks, overlay, draw, annotate) - def scatter(self, column_for_x, overlay=False, fit_line=False, **vargs): - """Creates scatterplots, optionally adding a line of best fit. + def bar(self, column_for_categories=None, overlay=False, **vargs): + """Plots bar charts for the table. - All scatterplots use the values in ``column_for_x`` as the x-values. A - total of n - 1 scatterplots are created where n is the number of - columns in the table, one for every column other than ``column_for_x``. + Each chart is categorized using the values in `column_for_categories` + and one chart is produced for every other column in the table. + A total of n - 1 charts are created where n is the number of columns + in the table. - Requires all columns in the table to contain numerical values only. - If the columns contain other types, a ``ValueError`` is raised. + Requires every column except for `column_for_categories` to be + numerical. If the columns contain other types, a `ValueError` is + raised. Args: - ``column_for_x`` (str): The name to use for the x-axis values of the - scatter plots. + column_for_categories (str): The name to use for the bar chart + categories Kwargs: - ``overlay`` (bool): If True, creates one scatterplot with n - 1 - y-values plotted, one for each column other than - ``column_for_x`` (instead of the default behavior of creating n - - 1 scatterplots. Also adds a legend that matches each dot - and best-fit line color to its column. - - ``fit_line`` (bool): If True, draws a line of best fit for each - scatterplot drawn. + overlay (bool): If True, creates one chart with n - 1 bars for each + category, one for each column other than `column_for_categories` + (instead of the default behavior of creating n - 1 charts). + Also adds a legend that matches each bar color to its column. - ``vargs``: Additional arguments that get passed into `plt.scatter`. 
- See http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.scatter + vargs: Additional arguments that get passed into `plt.bar`. + See http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.bar for additional arguments that can be passed into vargs. These - include: `marker` and `norm`, to name a couple. + include: `linewidth`, `xerr`, `yerr`, and `log`, to name a few. Returns: None Raises: - ``ValueError``: The table contains non-numerical values in columns. - - >>> x = [9, 3, 3, 1] - >>> y = [1, 2, 2, 10] - >>> z = [3, 4, 5, 6] - >>> table = Table([x, y, z], ['x', 'y', 'z']) - >>> table - x | y | z - 9 | 1 | 3 - 3 | 2 | 4 - 3 | 2 | 5 - 1 | 10 | 6 - >>> table.scatter('x') # doctest: +SKIP - - - - >>> table.scatter('x', overlay = True) # doctest: +SKIP - - - >>> table.scatter('x', fit_line = True) # doctest: +SKIP - - + ValueError: The Table contains non-numerical values in columns + other than `column_for_categories` """ - # Check for non-numerical values and raise a ValueError if any found - for col in self: - if any(isinstance(cell, np.flexible) for cell in self[col]): - raise ValueError("The column '{0}' contains non-numerical " - "values. A histogram cannot be drawn for this table." - .format(col)) - options = self.default_options.copy() options.update(vargs) - xdata, y_labels = self._split_by_column(column_for_x) + + xticks, y_labels = self._split_by_column(column_for_categories) + for label in y_labels: + if any(isinstance(cell, np.flexible) for cell in self[label]): + raise ValueError("The column '{0}' contains non-numerical " + "values. A bar graph cannot be drawn for this table." 
+ .format(label)) + + index = np.arange(self.num_rows) + margin = 0.1 + width = 1 - 2 * margin + if overlay: + width /= len(y_labels) def draw(axis, label, color): - axis.scatter(xdata, self[label], color=color, **options) - if fit_line: - m,b = np.polyfit(xdata, self[label], 1) - minx, maxx = np.min(xdata),np.max(xdata) - axis.plot([minx,maxx],[m*minx+b,m*maxx+b]) + if overlay: + xpos = index + margin + (1-2*margin)*labels.index(label)/len(labels) + else: + xpos = index + axis.bar(xpos, self[label], 1.0, color=color, **options) def annotate(axis, ticks): + if (ticks is not None) : + tick_labels = [ticks[int(l)] for l in axis.get_xticks() if l>> x = [9, 3, 3, 1] + >>> y = [1, 2, 2, 10] + >>> z = [3, 4, 5, 6] + >>> table = Table([x, y, z], ['x', 'y', 'z']) + >>> table + x | y | z + 9 | 1 | 3 + 3 | 2 | 4 + 3 | 2 | 5 + 1 | 10 | 6 + >>> table.scatter('x') # doctest: +SKIP + + + + >>> table.scatter('x', overlay = True) # doctest: +SKIP + + + >>> table.scatter('x', fit_line = True) # doctest: +SKIP + + + + """ + # Check for non-numerical values and raise a ValueError if any found + for col in self: + if any(isinstance(cell, np.flexible) for cell in self[col]): + raise ValueError("The column '{0}' contains non-numerical " + "values. A histogram cannot be drawn for this table." 
+ .format(col)) + + options = self.default_options.copy() + options.update(vargs) + xdata, y_labels = self._split_by_column(column_for_x) + + def draw(axis, label, color): + axis.scatter(xdata, self[label], color=color, **options) + if fit_line: + m,b = np.polyfit(xdata, self[label], 1) + minx, maxx = np.min(xdata),np.max(xdata) + axis.plot([minx,maxx],[m*minx+b,m*maxx+b]) + + def annotate(axis, ticks): + return None + self._visualize(column_for_x, y_labels, None, overlay, draw, annotate) + ########### # Support # ########### diff --git a/docs/tables.rst b/docs/tables.rst index a0579bd5a..8bd632024 100644 --- a/docs/tables.rst +++ b/docs/tables.rst @@ -86,6 +86,7 @@ Transformation (creates a new table) Table.select Table.drop Table.take + Table.exclude Table.where Table.sort Table.group @@ -125,3 +126,4 @@ Visualizations Table.hist Table.points Table.scatter + Table.boxplot From aef1001befd7bbecd67d3e3c464e01a9cae4dc08 Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Wed, 23 Dec 2015 19:05:11 -0800 Subject: [PATCH 12/16] Add to_csv method This is a convenience method that is commonly requested. --- datascience/tables.py | 29 +++++++++++++++++++++++++++++ docs/tables.rst | 1 + 2 files changed, 30 insertions(+) diff --git a/datascience/tables.py b/datascience/tables.py index 50fb1d5cb..23b118e84 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -1314,6 +1314,35 @@ def to_df(self): """Convert the table to a Pandas DataFrame.""" return pandas.DataFrame(self._columns) + def to_csv(self, filename): + """Creates a CSV file with the provided filename. + + The CSV is created in such a way that if we run + ``table.to_csv('my_table.csv')`` we can recreate the same table with + ``Table.read_table('my_table.csv')``. + + Args: + ``filename`` (str): The filename of the output CSV file. + + Returns: + None, outputs a file with name ``filename``. 
+ + >>> job = ['a', 'b', 'c', 'd'] + >>> wage = [10, 20, 15, 8] + >>> some_table = Table([job, wage], ['job', 'wage']) + >>> some_table + job | wage + a | 10 + b | 20 + c | 15 + d | 8 + >>> some_table.to_csv('my_table.csv') # doctest: +SKIP + + """ + # We use index = False to avoid the row number output that pandas does + # by default. + self.to_df().to_csv(filename, index = False) + def to_array(self): """Convert the table to a NumPy array.""" dt = np.dtype(list(zip(self.column_labels, diff --git a/docs/tables.rst b/docs/tables.rst index 8bd632024..a3fe8ab54 100644 --- a/docs/tables.rst +++ b/docs/tables.rst @@ -113,6 +113,7 @@ Exporting / Displaying Table.index_by Table.to_array Table.to_df + Table.to_csv Visualizations From ef6199c5525e1ae4e0e65bb9c850b75f83f66d72 Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Sat, 2 Jan 2016 16:19:38 -0800 Subject: [PATCH 13/16] Version 0.3.dev22 --- datascience/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datascience/version.py b/datascience/version.py index 1e70f4a0c..89d7e2955 100644 --- a/datascience/version.py +++ b/datascience/version.py @@ -1 +1 @@ -__version__ = '0.3.dev21' +__version__ = '0.3.dev22' From 20b5efb957c8d488ecc78384d4daaa9e9131f533 Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Mon, 4 Jan 2016 00:23:58 -0800 Subject: [PATCH 14/16] Add TOC to tutorial and try to fix Travis again I'm getting desperate, as you can tell. 
--- .travis.yml | 6 ++++++ docs/tutorial.rst | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/.travis.yml b/.travis.yml index 630648fcd..a68a990b7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,6 +35,12 @@ install: # TODO(sam): Add --upgrade flag when it works again - python3 setup.py install +# https://docs.travis-ci.com/user/gui-and-headless-browsers/#Using-xvfb-to-Run-Tests-That-Require-a-GUI +before_script: + - "export DISPLAY=:99.0" + - "sh -e /etc/init.d/xvfb start" + - sleep 3 # give xvfb some time to start + script: - coverage run setup.py test - cd docs && make html-raise-on-warning && cd .. diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 1328bae9e..20cd18c1c 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -13,6 +13,10 @@ For other useful tutorials and examples, see: .. _The textbook introduction to Tables: http://data8.org/text/1_data.html#tables .. _Example notebooks: https://github.com/deculler/TableDemos +.. contents:: Table of Contents + :depth: 2 + :local: + Getting Started --------------- From 9e8b6eb70271adbaf2c6d4e6453e71a54578b7c0 Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Mon, 4 Jan 2016 00:47:02 -0800 Subject: [PATCH 15/16] Fix buggy travis cache and ensure _images folder exists Really crossing my fingers now! 
--- .travis.yml | 8 +++----- docs/Makefile | 2 ++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index a68a990b7..28dfc1a8d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,7 +26,7 @@ addons: before_install: - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - chmod +x miniconda.sh - - ./miniconda.sh -b -p $HOME/miniconda + - ./miniconda.sh -b -f -p $HOME/miniconda - export PATH=/home/travis/miniconda/bin:$PATH - conda update --yes conda @@ -36,6 +36,8 @@ install: - python3 setup.py install # https://docs.travis-ci.com/user/gui-and-headless-browsers/#Using-xvfb-to-Run-Tests-That-Require-a-GUI +# sam: Not exactly sure why we need to initialize a display for this but it +# helps the tutorial plots build on Travis before_script: - "export DISPLAY=:99.0" - "sh -e /etc/init.d/xvfb start" @@ -48,7 +50,3 @@ script: after_success: - coveralls - bash tools/deploy_docs.sh - -cache: - directories: - - /home/travis/virtualenv/python3.4.2/ diff --git a/docs/Makefile b/docs/Makefile index fd5eddeb2..b05545c01 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -52,11 +52,13 @@ clean: rm -rf $(BUILDDIR)/* html: + mkdir -p $(BUILDDIR)/html/_images $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." html-raise-on-warning: + mkdir -p $(BUILDDIR)/html/_images $(SPHINXBUILD) -W -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html dirhtml: From 88d21e17d2efa20d30d4f1401b6dbf330fcf9b9b Mon Sep 17 00:00:00 2001 From: Sam Lau Date: Mon, 4 Jan 2016 01:08:36 -0800 Subject: [PATCH 16/16] Just use `make docs` instead of raise-on-warning Since we have warnings I can't currently get around when we generate images in the tutorial. 
--- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 28dfc1a8d..3ec57d10c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -45,7 +45,7 @@ before_script: script: - coverage run setup.py test - - cd docs && make html-raise-on-warning && cd .. + - make docs after_success: - coveralls