probcomp · gregory-marton · Jun 9, 2016 · Jun 6, 2016 · Jun 6, 2016 · Jun 6, 2016
diff --git a/src/mml_utils.py b/src/mml_utils.py
@@ -3,9 +3,11 @@
 from enum import Enum
 from string import Template
 
+import copy
 import json
 import jsonschema
 import pkgutil
+import uuid
 
 MML_SCHEMA = json.loads(
     pkgutil.get_data('bdbcontrib', 'mml.schema.json'))
@@ -127,3 +129,50 @@ def to_mml(mml_json, table, generator):
                 metamodel=mml_json['metamodel'],
                 subsample='SUBSAMPLE(%d),' % subsample if subsample else '',
                 schema_phrase=schema_phrase))
+
+
+@population_method(population_to_bdb=0, population_name=1)
+def validate_schema(bdb, table, mml_json):
+    """Returns a copy of mml_json with each unanalyzable column set to IGNORE.
+
+    Each non-ignored column gets a temporary one-column generator with a single
+    model, analyzed for a single iteration. If that raises AssertionError the
+    stattype is changed to IGNORE and the existing stattype is placed into that
+    column's 'guessed' field, overwriting it if it exists.
+
+    Parameters
+    ----------
+    bdb
+        Database handle; its execute() runs the BQL for the temp generators.
+    table
+        Name of the table the temporary generators are created for.
+    mml_json
+        JSON representation of the generator; must validate against MML_SCHEMA.
+    """
+    jsonschema.validate(mml_json, MML_SCHEMA)
+    bad_cols = []
+    for col, typ in mml_json['columns'].items():
+        # == rather than `is`: equal strings need not be the same object
+        if typ['stattype'] == 'IGNORE':
+            continue
+        one_col_json = copy.deepcopy(mml_json)
+        one_col_json['columns'] = {col: typ}
+        gen_name = uuid.uuid4().hex  # random name for the temp generator
+        try:
+            bdb.execute(to_mml(one_col_json, table, gen_name))
+            bdb.execute('INITIALIZE 1 MODEL FOR %s'
+                        % (bql_quote_name(gen_name),))
+            bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT'
+                        % (bql_quote_name(gen_name),))
+        except AssertionError:
+            bad_cols.append(col)
+        finally:
+            bdb.execute('DROP GENERATOR %s' % bql_quote_name(gen_name))
+    modified_schema = copy.deepcopy(mml_json)
+    # TODO(asilvers): Should we also return a summary of the modifications?
+    for col in bad_cols:
+        column = modified_schema['columns'][col]
+        column['guessed'] = column['stattype']
+        column['stattype'] = 'IGNORE'
+    jsonschema.validate(modified_schema, MML_SCHEMA)  # check what we return
+    return modified_schema
diff --git a/tests/mml.csv b/tests/mml.csv
@@ -1,26 +1,26 @@
-col1,col2,col3,col4,col5
-1,1,,1,YES
-1,2,1,2,NO
-1,1,1,,YES
-1,1,1,4,YES
-1,1,1,5,NO
-1,3,1,6,YES
-1,1,1,7,YES
-1,1,1,8,YES
-1,1,1,9,YES
-1,1,1,10,YES
-1,1,1,11,YES
-1,2,1,12,YES
-1,1,1,13,YES
-1,1,1,14,YES
-1,1,1,15,YES
-1,9,1,16,YES
-1,1,1,17,YES
-1,1,1,18,YES
-1,1,1,19,YES
-1,1,1,20,YES
-1,8,1,21,YES
-1,1,1,22,YES
-1,1,1,23,YES
-1,1,1,24,YES
-1,1,1,25,YES
+col1,col2,col3,col4,col5,col6
+1,1,,1,YES,10000000000
+1,2,1,2,NO,10000000001
+1,1,1,,YES,10000000002
+1,1,1,4,YES,10000000003
+1,1,1,5,NO,10000000004
+1,3,1,6,YES,10000000005
+1,1,1,7,YES,10000000006
+1,1,1,8,YES,10000000007
+1,1,1,9,YES,10000000008
+1,1,1,10,YES,10000000009
+1,1,1,11,YES,10000000010
+1,2,1,12,YES,10000000011
+1,1,1,13,YES,10000000012
+1,1,1,14,YES,10000000013
+1,1,1,15,YES,10000000014
+1,9,1,16,YES,10000000015
+1,1,1,17,YES,10000000016
+1,1,1,18,YES,10000000017
+1,1,1,19,YES,10000000018
+1,1,1,20,YES,10000000019
+1,8,1,21,YES,10000000020
+1,1,1,22,YES,10000000021
+1,1,1,23,YES,10000000022
+1,1,1,24,YES,10000000023
+1,1,1,25,YES,10000000024
diff --git a/tests/test_mml_utils.py b/tests/test_mml_utils.py
@@ -30,7 +30,8 @@ def test_mml_csv():
             'col2': StatType.CATEGORICAL,
             'col3': StatType.IGNORE,
             'col4': StatType.NUMERICAL,
-            'col5': StatType.CATEGORICAL})
+            'col5': StatType.CATEGORICAL,
+            'col6': StatType.NUMERICAL})
 
         mml_json = mml_utils.to_json(guesses)
         assert mml_json == {
@@ -40,10 +41,25 @@ def test_mml_csv():
                 'col2': {'stattype': 'CATEGORICAL'},
                 'col3': {'stattype': 'IGNORE'},
                 'col4': {'stattype': 'NUMERICAL'},
-                'col5': {'stattype': 'CATEGORICAL'}}}
+                'col5': {'stattype': 'CATEGORICAL'},
+                'col6': {'stattype': 'NUMERICAL'}}}
 
         mml_statement = mml_utils.to_mml(mml_json, 'table', 'generator')
         assert mml_statement == (
-            'CREATE GENERATOR generator FOR table '
+            'CREATE GENERATOR "generator" FOR "table" '
             'USING crosscat( '
-            '"col4" NUMERICAL,"col5" CATEGORICAL,"col2" CATEGORICAL);')
+            '"col6" NUMERICAL,"col4" NUMERICAL,"col5" CATEGORICAL,"col2" CATEGORICAL);')
+
+        # col6's values are constructed in such a way as to break crosscat.
+        # See https://github.com/probcomp/bayeslite/issues/284
+        # On validation, col6's stattype should therefore be changed to IGNORE.
+        mod_schema = mml_utils.validate_schema(bdb, 't', mml_json)
+        assert mod_schema == {
+            'metamodel': 'crosscat',
+            'columns': {
+                u'col1': {'stattype': 'IGNORE'},
+                u'col2': {'stattype': 'CATEGORICAL'},
+                u'col3': {'stattype': 'IGNORE'},
+                u'col4': {'stattype': 'NUMERICAL'},
+                u'col5': {'stattype': 'CATEGORICAL'},
+                u'col6': {'stattype': 'IGNORE', 'guessed': 'NUMERICAL'}}}