@population_method(population_to_bdb=0, population_name=1)
def validate_schema(bdb, table, mml_json):
    """Return a copy of mml_json with un-analyzable columns set to IGNORE.

    For every column that is not already ignored, this creates a temporary
    single-column generator, initializes one model and analyzes it for a
    single iteration. If that succeeds the column and stattype are deemed
    good. If it fails with an AssertionError the column's stattype is changed
    to IGNORE and the previous stattype is placed into that column's
    'guessed' field, overwriting it if it exists.

    The input mml_json is never modified; all changes are made on a deep copy.

    Parameters
    ----------
    bdb
        Database handle on which the BQL statements are executed.
        NOTE(review): supplied through the population_method decorator
        (population_to_bdb=0) -- confirm against that decorator's contract.
    table
        Name of the table the temporary generators are created for.
        NOTE(review): supplied through the population_method decorator
        (population_name=1) -- confirm against that decorator's contract.
    mml_json
        A json representation of the generator. Must validate against
        MML_SCHEMA.

    Returns
    -------
    A deep copy of mml_json in which every column that failed the trial
    analysis has stattype 'IGNORE' and its former stattype under 'guessed'.

    Raises
    ------
    jsonschema.ValidationError
        If mml_json, or the modified schema, does not validate against
        MML_SCHEMA.
    """
    # Fail fast on malformed input, before any generator is created.
    jsonschema.validate(mml_json, MML_SCHEMA)

    bad_cols = []
    for col, typ in mml_json['columns'].items():
        # If the column is already ignored there's nothing to check.
        # Compare by value: `is` tests object identity, which is not
        # guaranteed for equal strings (e.g. unicode produced by json.loads).
        if typ['stattype'] == 'IGNORE':
            continue
        one_col_json = copy.deepcopy(mml_json)
        one_col_json['columns'] = {col: typ}
        # Create a temp generator under a random, collision-free name.
        gen_name = uuid.uuid4().hex
        quoted_gen = bql_quote_name(gen_name)
        created = False
        try:
            bdb.execute(to_mml(one_col_json, table, gen_name))
            created = True
            bdb.execute('INITIALIZE 1 MODEL FOR %s' % (quoted_gen,))
            bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (quoted_gen,))
        except AssertionError:
            bad_cols.append(col)
        finally:
            # Drop our temp generator, but only if it was actually created;
            # otherwise the DROP itself would fail and mask the real error.
            if created:
                bdb.execute('DROP GENERATOR %s' % (quoted_gen,))

    modified_schema = copy.deepcopy(mml_json)
    # TODO(asilvers): Should we also return a summary of the modifications?
    for col in bad_cols:
        column = modified_schema['columns'][col]
        column['guessed'] = column['stattype']
        column['stattype'] = 'IGNORE'
    # Check the schema we are about to hand back, not just the input.
    jsonschema.validate(modified_schema, MML_SCHEMA)
    return modified_schema
+1,9,1,16,YES,10000000015 +1,1,1,17,YES,10000000016 +1,1,1,18,YES,10000000017 +1,1,1,19,YES,10000000018 +1,1,1,20,YES,10000000019 +1,8,1,21,YES,10000000020 +1,1,1,22,YES,10000000021 +1,1,1,23,YES,10000000022 +1,1,1,24,YES,10000000023 +1,1,1,25,YES,10000000024 diff --git a/tests/test_mml_utils.py b/tests/test_mml_utils.py index d75ca79..9c31962 100644 --- a/tests/test_mml_utils.py +++ b/tests/test_mml_utils.py @@ -30,7 +30,8 @@ def test_mml_csv(): 'col2': StatType.CATEGORICAL, 'col3': StatType.IGNORE, 'col4': StatType.NUMERICAL, - 'col5': StatType.CATEGORICAL}) + 'col5': StatType.CATEGORICAL, + 'col6': StatType.NUMERICAL}) mml_json = mml_utils.to_json(guesses) assert mml_json == { @@ -40,10 +41,25 @@ def test_mml_csv(): 'col2': {'stattype': 'CATEGORICAL'}, 'col3': {'stattype': 'IGNORE'}, 'col4': {'stattype': 'NUMERICAL'}, - 'col5': {'stattype': 'CATEGORICAL'}}} + 'col5': {'stattype': 'CATEGORICAL'}, + 'col6': {'stattype': 'NUMERICAL'}}} mml_statement = mml_utils.to_mml(mml_json, 'table', 'generator') assert mml_statement == ( - 'CREATE GENERATOR generator FOR table ' + 'CREATE GENERATOR "generator" FOR "table" ' 'USING crosscat( ' - '"col4" NUMERICAL,"col5" CATEGORICAL,"col2" CATEGORICAL);') + '"col6" NUMERICAL,"col4" NUMERICAL,"col5" CATEGORICAL,"col2" CATEGORICAL);') + + # col6's values are constructed in such a way as to break crosscat. + # See https://github.com/probcomp/bayeslite/issues/284 + # On validation the column should be ignored + mod_schema = mml_utils.validate_schema(bdb, 't', mml_json) + assert mod_schema == { + 'metamodel': 'crosscat', + 'columns': { + u'col1': {'stattype': 'IGNORE'}, + u'col2': {'stattype': 'CATEGORICAL'}, + u'col3': {'stattype': 'IGNORE'}, + u'col4': {'stattype': 'NUMERICAL'}, + u'col5': {'stattype': 'CATEGORICAL'}, + u'col6': {'stattype': 'IGNORE', 'guessed': 'NUMERICAL'}}}