test nces, nces2, roces working.

Jean-KOUAGOU committed Jan 7, 2025
1 parent ede1978 commit 97a7174
Showing 7 changed files with 44 additions and 39 deletions.
20 changes: 14 additions & 6 deletions examples/train_nces.py
@@ -1,6 +1,6 @@
import argparse
import json, os
from ontolearn.concept_learner import NCES
from ontolearn.concept_learner import NCES, NCES2, ROCES
from transformers import set_seed

def str2bool(v):
@@ -35,19 +35,27 @@ def start(args):
assert isinstance(training_data, list), "The training data must either be stored as a dictionary ({'expr': {'positive examples': [], 'negative examples': []}, ...,}) or a list of items"
except FileNotFoundError:
print("Couldn't find training data in the specified path. Defaulting to generating training data.")
synthesizer = NCES(knowledge_base_path=args.kb, learner_names=['SetTransformer', 'GRU', 'LSTM'], path_of_embeddings=args.path_of_embeddings, auto_train=False,
max_length=48, proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=args.load_pretrained, verbose=True)
synthesizer.train(training_data, epochs=args.epochs)
if args.synthesizer == "NCES":
synthesizer = NCES(knowledge_base_path=args.kb, learner_names=['SetTransformer', 'GRU', 'LSTM'], path_of_embeddings=args.path_of_embeddings, auto_train=False,
max_length=48, proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=args.load_pretrained, verbose=True)
elif args.synthesizer == "NCES2":
synthesizer = NCES2(knowledge_base_path=args.kb, auto_train=False, max_length=48, proj_dim=128,
drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=args.load_pretrained, verbose=True)
else:
synthesizer = ROCES(knowledge_base_path=args.kb, auto_train=False, k=5, max_length=48, proj_dim=128,
drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=args.load_pretrained, verbose=True)
synthesizer.train(training_data, epochs=args.epochs, max_num_lps=args.max_num_lps, refinement_expressivity=args.refinement_expressivity)
print(synthesizer)

if __name__ == '__main__':
set_seed(42)
parser = argparse.ArgumentParser()
parser.add_argument('--kb', type=str, default=None, help='Paths of a knowledge base (OWL file)')
parser.add_argument('--synthesizer', type=str, default="ROCES", help='Name of the neural synthesizer')
parser.add_argument('--refinement_expressivity', type=float, default=0.9, help='The expressivity of the refinement operator during training data generation')
parser.add_argument('--max_num_lps', type=int, default=20000, help='Maximum number of learning problems to generate if no training data is provided')
parser.add_argument('--path_of_embeddings', type=str, default=None, help='Path to a csv file containing embeddings for the KB.')
parser.add_argument('--path_train_data', type=str, help='Path to training data')
parser.add_argument('--num_lps', type=int, default=10000, help='Maximum number of learning problems to generate if no training data is provided')
parser.add_argument('--epochs', type=int, default=50, help='Number of training epochs')
parser.add_argument('--load_pretrained', type=str2bool, default=True, help='Whether to load the pretrained model')
parser.add_argument('--load_pretrained', type=str2bool, default=False, help='Whether to load the pretrained model')
start(parser.parse_args())
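
The rewritten script dispatches on --synthesizer. For reference, a minimal sketch of the equivalent direct API usage with the ROCES branch, assuming the Family KB bundled with Ontolearn (the path is a placeholder):

# Hedged sketch mirroring examples/train_nces.py with --synthesizer ROCES.
from ontolearn.concept_learner import ROCES

synthesizer = ROCES(knowledge_base_path="KGs/Family/family-benchmark_rich_background.owl",
                    auto_train=False, k=5, max_length=48, proj_dim=128, drop_prob=0.1,
                    num_heads=4, num_seeds=1, m=32, load_pretrained=False, verbose=True)
# With no training data on disk, train() generates learning problems first.
synthesizer.train(None, epochs=50, max_num_lps=20000, refinement_expressivity=0.9)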
44 changes: 23 additions & 21 deletions ontolearn/concept_learner.py
@@ -819,7 +819,7 @@ def _rename_individuals(individual_name):
return individual_name.split('/')[-1]
return individual_name

if self.path_of_embeddings is None or (os.path.idir(self.path_of_embeddings) and not glob.glob(self.path_of_embeddings+'*_entity_embeddings.csv')) or not self.path_of_embeddings.endswith('.csv'):
if self.path_of_embeddings is None or (os.path.isdir(self.path_of_embeddings) and not glob.glob(self.path_of_embeddings+'*_entity_embeddings.csv')) or not self.path_of_embeddings.endswith('.csv'):
if not os.path.exists(self.knowledge_base_path):
raise ValueError(f"{knowledge_base_path} not found")
try:
@@ -829,11 +829,13 @@ def _rename_individuals(individual_name):
except Exception:
print('\x1b[0;30;43m dicee is not installed, will first install it\x1b[0m\n')
subprocess.run('pip install dicee==0.1.4')
print("\n"+"\x1b[0;30;43m"+"Embeddings not found. Will quickly train embeddings beforehand"+"Poor performance is expected as we will also train the synthesizer for a few epochs.\
For maximum performance, use pretrained models or train embeddings for many epochs, and the neural synthesizer on massive amounts of data and for many epochs\x1b[0m"+"\n")
subprocess.run(f"dicee --path_single_kg {self.knowledge_base_path} --path_to_store_single_run temp_embeddings --backend rdflib --save_embeddings_as_csv --num_epochs 20 --model DeCaL",
shell = True, executable="/bin/bash")
assert os.path.exists("./temp_embeddings/DeCaL_entity_embeddings.csv"), "It seems that embeddings were not stored at the expected directory (/temp_embeddings/DeCaL_entity_embeddings.csv)"
print("\n"+"\x1b[0;30;43m"+"Embeddings not found. Will quickly train embeddings beforehand. "+"Poor performance is expected as we will also train the synthesizer for a few epochs.\nFor maximum performance, use pretrained models or train embeddings for many epochs, and the neural synthesizer on massive amounts of data and for many epochs\x1b[0m"+"\n")
try:
subprocess.run(f"dicee --path_single_kg {self.knowledge_base_path} --path_to_store_single_run temp_embeddings --backend rdflib --save_embeddings_as_csv --num_epochs 20 --model DeCaL",
shell = True, executable="/bin/bash")
assert os.path.exists("./temp_embeddings/DeCaL_entity_embeddings.csv"), "It seems that embeddings were not stored at the expected directory (/temp_embeddings/DeCaL_entity_embeddings.csv)"
except Exception:
raise ValueError("\nPlease try providing the absolute path to the knowledge base, e.g., /home/ndah/Dev/Ontolean/KGs/Family/family-benchmark_rich_background.owl\n")
self.path_of_embeddings = "./temp_embeddings/DeCaL_entity_embeddings.csv"
if self.auto_train:
print("\n"+"\x1b[0;30;43m"+f"Will also generate some training data and train {self.name} for 5 epochs"+"\x1b[0m"+"\n")
@@ -1117,8 +1119,8 @@ def fit_from_iterable(self, dataset: Union[List[Tuple[str, Set[OWLNamedIndividua
return predictions_as_owl_class_expressions

@staticmethod
def generate_training_data(kb_path, num_lps=1000, storage_path="./NCES_Training_Data"):
lp_gen = LPGen(kb_path=kb_path, max_num_lps=num_lps, storage_path=storage_path)
def generate_training_data(kb_path, max_num_lps=1000, refinement_expressivity=0.2, storage_path="./NCES_Training_Data"):
lp_gen = LPGen(kb_path=kb_path, max_num_lps=max_num_lps, refinement_expressivity=refinement_expressivity, storage_path=storage_path)
lp_gen.generate()
print("Loading generated data...")
with open(f"{storage_path}/LPs.json") as file:
@@ -1129,7 +1131,7 @@ def generate_training_data(kb_path, num_lps=1000, storage_path="./NCES_Training_
return lps


def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_lps=1000, learning_rate=1e-4, decay_rate=0.0,
def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, max_num_lps=1000, refinement_expressivity=0.2, learning_rate=1e-4, decay_rate=0.0,
clip_value=5.0, num_workers=8, save_model=True, storage_path=None, optimizer='Adam', record_runtime=True,
example_sizes=None, shuffle_examples=False):
if os.cpu_count() <= num_workers:
@@ -1143,7 +1145,7 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_
if batch_size is None:
batch_size = self.batch_size
if data is None:
data = self.generate_training_data(self.knowledge_base_path, num_lps=num_lps, storage_path=storage_path)
data = self.generate_training_data(self.knowledge_base_path, max_num_lps=max_num_lps, refinement_expressivity=refinement_expressivity, storage_path=storage_path)

trainer = NCESTrainer(self, epochs=epochs, batch_size=batch_size, learning_rate=learning_rate, decay_rate=decay_rate,
clip_value=clip_value, num_workers=num_workers, storage_path=storage_path)
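
Across the synthesizers, num_lps is renamed to max_num_lps and refinement_expressivity is threaded through to LPGen. A hedged sketch of the updated call sites, where model stands for an already constructed NCES instance and the paths are placeholders:

# Generate learning problems explicitly with the new keyword names...
lps = NCES.generate_training_data("KGs/Family/family-benchmark_rich_background.owl",
                                  max_num_lps=1000, refinement_expressivity=0.2,
                                  storage_path="./NCES_Training_Data")
# ...or let train() generate them internally when data is None.
model.train(None, epochs=50, max_num_lps=1000, refinement_expressivity=0.2)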
@@ -1351,7 +1353,7 @@ def fit_one(self, pos: Union[Set[OWLNamedIndividual], Set[str]], neg: Union[Set[
dataloaders = []
for num_ind_points in self.model:
dataset = ROCESDatasetInference([("", pos_str, neg_str)],
triples_data=self.triples_data, k=None,
triples_data=self.triples_data, k=self.k if hasattr(self, "k") else None,
vocab=self.vocab, inv_vocab=self.inv_vocab,
max_length=self.max_length, num_examples=self.num_examples,
sampling_strategy=self.sampling_strategy,
@@ -1447,9 +1449,9 @@ def fit_from_iterable(self, data: Union[List[Tuple[str, Set[OWLNamedIndividual],
dataloaders = []
for num_ind_points in self.model:
dataset = ROCESDatasetInference(data,
self.triples_data,
self.vocab, self.inv_vocab,
self.max_length, self.num_examples,
self.triples_data, k=self.k if hasattr(self, "k") else None,
vocab=self.vocab, inv_vocab=self.inv_vocab,
max_length=self.max_length, num_examples=self.num_examples,
sampling_strategy=self.sampling_strategy,
num_pred_per_lp=self.num_predictions)
dataset.load_embeddings(self.model[num_ind_points]["emb_model"])
@@ -1477,10 +1479,10 @@ def fit_from_iterable(self, data: Union[List[Tuple[str, Set[OWLNamedIndividual],
return predictions_as_owl_class_expressions

@staticmethod
def generate_training_data(kb_path, num_lps=1000, beyond_alc=False, storage_path=None):
def generate_training_data(kb_path, max_num_lps=1000, refinement_expressivity=0.2, beyond_alc=False, storage_path=None):
if storage_path is None:
storage_path = f"./Training_Data_{self.name}"
lp_gen = LPGen(kb_path=kb_path, max_num_lps=num_lps, beyond_alc=beyond_alc, storage_path=storage_path)
lp_gen = LPGen(kb_path=kb_path, max_num_lps=max_num_lps, refinement_expressivity=refinement_expressivity, beyond_alc=beyond_alc, storage_path=storage_path)
lp_gen.generate()
print("Loading generated data...")
with open(f"{storage_path}/LPs.json") as file:
@@ -1491,7 +1493,7 @@ def generate_training_data(kb_path, num_lps=1000, beyond_alc=False, storage_path
return lps


def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_lps=1000, learning_rate=1e-4, decay_rate=0.0,
def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, max_num_lps=1000, refinement_expressivity=0.2, learning_rate=1e-4, decay_rate=0.0,
clip_value=5.0, num_workers=8, save_model=True, storage_path=None, optimizer='Adam', record_runtime=True, shuffle_examples=False):
if os.cpu_count() <= num_workers:
num_workers = max(0,os.cpu_count()-1)
@@ -1503,11 +1505,11 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_
if batch_size is None:
batch_size = self.batch_size
if data is None:
data = self.generate_training_data(self.knowledge_base_path, num_lps=num_lps, beyond_alc=True, storage_path=storage_path)
data = self.generate_training_data(self.knowledge_base_path, max_num_lps=max_num_lps, refinement_expressivity=refinement_expressivity, beyond_alc=True, storage_path=storage_path)
vocab_size_before = len(self.vocab)
self.add_data_values(data) # Add data values based on training data
self.path_of_trained_models = storage_path+"/trained_models"
if len(self.vocab) > vocab_size_before and self.load_pretrained:
if len(self.vocab) > vocab_size_before:
self.model = self.get_synthesizer(verbose=False)
trainer = NCESTrainer(self, epochs=epochs, batch_size=batch_size, learning_rate=learning_rate, decay_rate=decay_rate,
clip_value=clip_value, num_workers=num_workers, storage_path=storage_path)
@@ -1525,11 +1527,11 @@ def __init__(self, knowledge_base_path, nces2_or_roces=True,
input_dropout=0.0, feature_map_dropout=0.1, kernel_size=4, num_of_output_channels=32,
learning_rate=1e-4, decay_rate=0.0, clip_value=5.0, batch_size=256, num_workers=4,
max_length=48, load_pretrained=True, verbose: int = 0):

self.k = k
super().__init__(knowledge_base_path, nces2_or_roces,
quality_func, num_predictions, path_of_trained_models, auto_train, proj_dim, drop_prob,
num_heads, num_seeds, m, ln, embedding_dim, sampling_strategy, input_dropout, feature_map_dropout,
kernel_size, num_of_output_channels, learning_rate, decay_rate, clip_value, batch_size,
num_workers, max_length, load_pretrained, verbose)

self.k = k
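
Two changes interact here: fit_one and fit_from_iterable now pass k=self.k if hasattr(self, "k") else None (only ROCES defines k, and NCES2 shares these code paths), and ROCES.__init__ sets self.k before calling the base constructor, which may build the synthesizer while self.k is consulted. A condensed sketch of the ordering fix with hypothetical class names, not the real signatures:

# Minimal shape of the fix; the real constructors take many more arguments.
class Nces2Base:
    def __init__(self, knowledge_base_path):
        self.model = self.get_synthesizer()         # runs during __init__

    def get_synthesizer(self):
        # mirrors the hasattr(self, "k") guard used at inference time
        return {"k": getattr(self, "k", None)}

class RocesSketch(Nces2Base):
    def __init__(self, knowledge_base_path, k=5):
        self.k = k                                  # must precede super().__init__
        super().__init__(knowledge_base_path)

assert RocesSketch("family.owl").model["k"] == 5    # k is visible to the base class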

4 changes: 2 additions & 2 deletions ontolearn/data_struct.py
@@ -397,7 +397,7 @@ def __init__(self, data, triples_data, k, vocab, inv_vocab, max_length, sampling

def load_embeddings(self, embedding_model):
embeddings, _ = embedding_model.get_embeddings()
self.embeddings = embeddings.cpu()
self.embeddings = embeddings.detach().cpu()


def set_k(self, k):
@@ -451,7 +451,7 @@ def __init__(self, data, triples_data, k, vocab, inv_vocab, max_length, num_exam

def load_embeddings(self, embedding_model):
embeddings, _ = embedding_model.get_embeddings()
self.embeddings = embeddings.cpu()
self.embeddings = embeddings.detach().cpu()

def set_k(self, k):
self.k = k
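
The added .detach() matters whenever get_embeddings() returns tensors still attached to the autograd graph: .cpu() alone preserves requires_grad, so a later .numpy() conversion would fail. A self-contained illustration:

import torch

emb = torch.randn(4, 8, requires_grad=True)
# emb.cpu().numpy() would raise: "Can't call numpy() on Tensor that requires grad"
safe = emb.detach().cpu()       # drop the graph reference, then move to CPU
print(safe.numpy().shape)       # (4, 8)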
6 changes: 1 addition & 5 deletions tests/test_lp_generator.py
@@ -11,11 +11,7 @@ class LPGen_Test(unittest.TestCase):
def test_generate_load(self):
lp_gen = LPGen(kb_path=PATH_FAMILY, storage_path=STORAGE_PATH)
lp_gen.generate()
print("Loading generated data...")
with open(f"{STORAGE_DIR}/triples/train.txt") as file:
triples_data = file.readlines()
print("Number of triples:", len(triples_data))
with open(f"{STORAGE_DIR}/LPs.json") as file:
with open(f"{STORAGE_PATH}/LPs.json") as file:
lps = json.load(file)
print("Number of learning problems:", len(lps))
self.assertGreaterEqual(lp_gen.lp_gen.max_num_lps, len(lps))
3 changes: 1 addition & 2 deletions tests/test_nces.py
@@ -37,8 +37,7 @@ class TestNCES(unittest.TestCase):

def test_prediction_quality_family(self):
knowledge_base_path = base_path[:base_path.rfind("/")+1] + "KGs/Family/family-benchmark_rich_background.owl"
model = NCES(knowledge_base_path=knowledge_base_path, learner_names=['SetTransformer'], path_of_embeddings=None,
max_length=48, proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=True, verbose=True)
model = NCES(knowledge_base_path=knowledge_base_path, learner_names=['SetTransformer'], max_length=48, proj_dim=128, rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, load_pretrained=True, verbose=True)
KB = KnowledgeBase(path=model.knowledge_base_path)
dl_parser = DLSyntaxParser(model.kb_namespace)
brother = dl_parser.parse('Brother')
2 changes: 1 addition & 1 deletion tests/test_nces2.py
@@ -37,7 +37,7 @@ class TestNCES2(unittest.TestCase):

def test_prediction_quality_family(self):
knowledge_base_path = base_path[:base_path.rfind("/")+1] + "KGs/Family/family-benchmark_rich_background.owl"
model = NCES2(knowledge_base_path=knowledge_base_path, max_length=48, proj_dim=128, drop_prob=0.1,
model = NCES2(knowledge_base_path=knowledge_base_path, max_length=48, proj_dim=128, drop_prob=0.1,
num_heads=4, num_seeds=1, m=32, load_pretrained=True, verbose=True)
KB = KnowledgeBase(path=model.knowledge_base_path)
dl_parser = DLSyntaxParser(model.kb_namespace)
4 changes: 2 additions & 2 deletions tests/test_roces.py
@@ -33,11 +33,11 @@ def seed_everything():

base_path = pathlib.Path(__file__).parent.resolve()._str

class TestNCES2(unittest.TestCase):
class TestROCES(unittest.TestCase):

def test_prediction_quality_family(self):
knowledge_base_path = base_path[:base_path.rfind("/")+1] + "KGs/Family/family-benchmark_rich_background.owl"
model = ROCES(knowledge_base_path=knowledge_base_path, max_length=48, proj_dim=128, drop_prob=0.1,
model = ROCES(knowledge_base_path=knowledge_base_path, k=5, max_length=48, proj_dim=128, drop_prob=0.1,
num_heads=4, num_seeds=1, m=32, load_pretrained=True, verbose=True)
KB = KnowledgeBase(path=model.knowledge_base_path)
dl_parser = DLSyntaxParser(model.kb_namespace)
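
Taken together, the test fixes (the STORAGE_PATH reference in test_lp_generator.py, the corrected TestROCES class name, and the explicit k=5 passed to ROCES) back up the commit title. A hedged local check, assuming pytest is installed: python -m pytest tests/test_nces.py tests/test_nces2.py tests/test_roces.py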
