Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a validation method to mml_utils #154

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions src/mml_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
from enum import Enum
from string import Template

import copy
import json
import jsonschema
import pkgutil
import uuid

MML_SCHEMA = json.loads(
pkgutil.get_data('bdbcontrib', 'mml.schema.json'))
Expand Down Expand Up @@ -127,3 +129,50 @@ def to_mml(mml_json, table, generator):
metamodel=mml_json['metamodel'],
subsample='SUBSAMPLE(%d),' % subsample if subsample else '',
schema_phrase=schema_phrase))


@population_method(population_to_bdb=0, population_name=1)
def validate_schema(bdb, table, mml_json):
    """Return a copy of `mml_json` with un-analyzable columns set to IGNORE.

    For every column that is not already IGNOREd, a temporary single-column
    generator is created, initialized with one model and analyzed for one
    iteration. If any of those steps raises AssertionError, the column is
    deemed bad: in the returned schema its stattype becomes 'IGNORE' and its
    previous stattype is stored in the column's 'guessed' field, overwriting
    any existing value. The input `mml_json` is not modified.

    Parameters
    ----------
    bdb
        Object whose `execute` method runs the generated statements.
        NOTE(review): presumably supplied by `population_method` when called
        as a population method -- confirm against the decorator.
    table
        Name of the table the temporary generators are created for; passed
        through to `to_mml`.
    mml_json
        A json representation of the generator. Must validate against
        MML_SCHEMA.

    Returns
    -------
    dict
        A deep copy of `mml_json` with the failing columns switched to
        IGNORE.

    Raises
    ------
    jsonschema.ValidationError
        If the resulting schema does not validate against MML_SCHEMA.
    """
    bad_cols = []
    for col, typ in mml_json['columns'].items():
        # If the column is already ignored there's nothing to check.
        # Compare by value: `is` tests object identity and is not reliable
        # for strings (e.g. unicode values produced by json.loads).
        if typ['stattype'] == 'IGNORE':
            continue
        one_col_json = copy.deepcopy(mml_json)
        one_col_json['columns'] = {col: typ}
        # Create a temp generator under a random, collision-free name.
        gen_name = uuid.uuid4().hex
        quoted_gen_name = bql_quote_name(gen_name)
        created = False
        try:
            bdb.execute(to_mml(one_col_json, table, gen_name))
            created = True
            bdb.execute('INITIALIZE 1 MODEL FOR %s' % (quoted_gen_name,))
            bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT'
                        % (quoted_gen_name,))
        except AssertionError:
            bad_cols.append(col)
        finally:
            # Drop our temp generator, but only if it was actually created;
            # otherwise the DROP itself would fail and mask the real outcome.
            if created:
                bdb.execute('DROP GENERATOR %s' % (quoted_gen_name,))
    modified_schema = copy.deepcopy(mml_json)
    # TODO(asilvers): Should we also return a summary of the modifications?
    for col in bad_cols:
        column = modified_schema['columns'][col]
        column['guessed'] = column['stattype']
        column['stattype'] = 'IGNORE'
    # Validate what we are about to return, not the untouched input.
    jsonschema.validate(modified_schema, MML_SCHEMA)
    return modified_schema
52 changes: 26 additions & 26 deletions tests/mml.csv
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
col1,col2,col3,col4,col5
1,1,,1,YES
1,2,1,2,NO
1,1,1,,YES
1,1,1,4,YES
1,1,1,5,NO
1,3,1,6,YES
1,1,1,7,YES
1,1,1,8,YES
1,1,1,9,YES
1,1,1,10,YES
1,1,1,11,YES
1,2,1,12,YES
1,1,1,13,YES
1,1,1,14,YES
1,1,1,15,YES
1,9,1,16,YES
1,1,1,17,YES
1,1,1,18,YES
1,1,1,19,YES
1,1,1,20,YES
1,8,1,21,YES
1,1,1,22,YES
1,1,1,23,YES
1,1,1,24,YES
1,1,1,25,YES
col1,col2,col3,col4,col5,col6
1,1,,1,YES,10000000000
1,2,1,2,NO,10000000001
1,1,1,,YES,10000000002
1,1,1,4,YES,10000000003
1,1,1,5,NO,10000000004
1,3,1,6,YES,10000000005
1,1,1,7,YES,10000000006
1,1,1,8,YES,10000000007
1,1,1,9,YES,10000000008
1,1,1,10,YES,10000000009
1,1,1,11,YES,10000000010
1,2,1,12,YES,10000000011
1,1,1,13,YES,10000000012
1,1,1,14,YES,10000000013
1,1,1,15,YES,10000000014
1,9,1,16,YES,10000000015
1,1,1,17,YES,10000000016
1,1,1,18,YES,10000000017
1,1,1,19,YES,10000000018
1,1,1,20,YES,10000000019
1,8,1,21,YES,10000000020
1,1,1,22,YES,10000000021
1,1,1,23,YES,10000000022
1,1,1,24,YES,10000000023
1,1,1,25,YES,10000000024
24 changes: 20 additions & 4 deletions tests/test_mml_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def test_mml_csv():
'col2': StatType.CATEGORICAL,
'col3': StatType.IGNORE,
'col4': StatType.NUMERICAL,
'col5': StatType.CATEGORICAL})
'col5': StatType.CATEGORICAL,
'col6': StatType.NUMERICAL})

mml_json = mml_utils.to_json(guesses)
assert mml_json == {
Expand All @@ -40,10 +41,25 @@ def test_mml_csv():
'col2': {'stattype': 'CATEGORICAL'},
'col3': {'stattype': 'IGNORE'},
'col4': {'stattype': 'NUMERICAL'},
'col5': {'stattype': 'CATEGORICAL'}}}
'col5': {'stattype': 'CATEGORICAL'},
'col6': {'stattype': 'NUMERICAL'}}}

mml_statement = mml_utils.to_mml(mml_json, 'table', 'generator')
assert mml_statement == (
'CREATE GENERATOR generator FOR table '
'CREATE GENERATOR "generator" FOR "table" '
'USING crosscat( '
'"col4" NUMERICAL,"col5" CATEGORICAL,"col2" CATEGORICAL);')
'"col6" NUMERICAL,"col4" NUMERICAL,"col5" CATEGORICAL,"col2" CATEGORICAL);')

# col6's values are constructed in such a way as to break crosscat.
# See https://github.com/probcomp/bayeslite/issues/284
# On validation the column should be ignored
mod_schema = mml_utils.validate_schema(bdb, 't', mml_json)
assert mod_schema == {
'metamodel': 'crosscat',
'columns': {
u'col1': {'stattype': 'IGNORE'},
u'col2': {'stattype': 'CATEGORICAL'},
u'col3': {'stattype': 'IGNORE'},
u'col4': {'stattype': 'NUMERICAL'},
u'col5': {'stattype': 'CATEGORICAL'},
u'col6': {'stattype': 'IGNORE', 'guessed': 'NUMERICAL'}}}