Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EL Geometric Models #68

Merged
merged 14 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Deprecated
### Removed
### Fixed
- Fix bug in GCI2 score for ELEmbeddings
- Fix bottleneck in ELBE example for PPI.
- Fix bugs in BoxSquaredEL model.

### Security

## [0.3.0]
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ git clone https://github.com/bio-ontology-research-group/mowl.git

cd mowl

conda env create -f envs/environment_3.8.yml
conda env create -f envs/environment_3_8.yml
conda activate mowl

./build_jars.sh
Expand Down
2 changes: 1 addition & 1 deletion examples/elmodels/plot_1_elembeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
# ":math:`p_1` interacts with :math:`p_2`" is encoded using GCI 2 as:
#
# .. math::
# p_1 \sqsubseteq interacts\_with. p_2
# p_1 \sqsubseteq \exists interacts\_with. p_2
#
# For that, we can use the class :class:`mowl.models.elembeddings.examples.model_ppi.ELEmPPI` mode, which uses the :class:`mowl.datasets.builtin.PPIYeastSlimDataset` dataset.

Expand Down
2 changes: 1 addition & 1 deletion examples/elmodels/plot_2_elboxembeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
# ":math:`p_1` interacts with :math:`p_2`" is encoded using GCI 2 as:
#
# .. math::
# p_1 \sqsubseteq interacts\_with. p_2
# p_1 \sqsubseteq \exists interacts\_with. p_2
#
# For that, we can use the class :class:`mowl.models.elembeddings.examples.model_ppi.ELBoxPPI` mode, which uses the :class:`mowl.datasets.builtin.PPIYeastSlimDataset` dataset.

Expand Down
45 changes: 44 additions & 1 deletion mowl/base_models/elmodel.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from mowl.ontology.normalize import ELNormalizer
from mowl.base_models.model import Model
from mowl.datasets.el import ELDataset
from mowl.projection import projector_factory
import torch as th
from torch.utils.data import DataLoader, default_collate
from mowl.datasets.el import ELDataset

from deprecated.sphinx import versionadded

from org.semanticweb.owlapi.model import OWLClassExpression, OWLClass, OWLObjectSomeValuesFrom, OWLObjectIntersectionOf
Expand Down Expand Up @@ -48,6 +50,7 @@ def __init__(self, dataset, embed_dim, batch_size, extended=True, model_filepath
self._validation_datasets = None
self._testing_datasets = None

self._loaded_eval = False

def init_module(self):
raise NotImplementedError
Expand Down Expand Up @@ -379,3 +382,43 @@ def from_pretrained(self, model):
#self._kge_method = kge_method




def load_pairwise_eval_data(self):
    """Lazily build the pairwise (head, relation, tail) evaluation data.

    Projects the dataset's training and testing ontologies into edges
    over the evaluation property and caches head/tail entity names and
    both edge sets on the instance. Subsequent calls are no-ops.
    """
    if self._loaded_eval:
        return

    relation = self.dataset.get_evaluation_property()
    heads, tails = self.dataset.evaluation_classes

    # Project ontologies to plain (head, rel, tail) edges restricted to
    # the single evaluation relation.
    projector = projector_factory('taxonomy_rels', taxonomy=False,
                                  relations=[relation])

    self._head_entities = heads.as_str
    self._tail_entities = tails.as_str
    self._training_set = projector.project(self.dataset.ontology)
    self._testing_set = projector.project(self.dataset.testing)
    self._loaded_eval = True


@property
def training_set(self):
    """Edges projected from the training ontology (built on first access)."""
    self.load_pairwise_eval_data()
    return self._training_set

@property
def testing_set(self):
    """Edges projected from the testing ontology (built on first access)."""
    self.load_pairwise_eval_data()
    return self._testing_set

@property
def head_entities(self):
    """Names (str) of candidate head entities for evaluation (lazy)."""
    self.load_pairwise_eval_data()
    return self._head_entities

@property
def tail_entities(self):
    """Names (str) of candidate tail entities for evaluation (lazy)."""
    self.load_pairwise_eval_data()
    return self._tail_entities
3 changes: 2 additions & 1 deletion mowl/evaluation/rank_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def load_training_scores(self):
c, d = self.head_name_indexemb[c], self.tail_name_indexemb[d]
c, d = self.head_indexemb_indexsc[c], self.tail_indexemb_indexsc[d]

self.training_scores[c, d] = 1000000
self.training_scores[c, d] = 10000

logging.info("Training scores created")
self._loaded_tr_scores = True
Expand Down Expand Up @@ -231,6 +231,7 @@ def activation(x):
print(f'Hits@100: {top100:.2f} Filtered: {ftop100:.2f}')
print(f'MR: {mean_rank:.2f} Filtered: {fmean_rank:.2f}')
print(f'AUC: {rank_auc:.2f} Filtered: {frank_auc:.2f}')
print(f"Tail entities: {num_tail_entities}")

self.metrics = {
"hits@1": top1,
Expand Down
2 changes: 2 additions & 0 deletions mowl/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from mowl.models.elboxembeddings.examples.model_ppi import ELBoxPPI
from mowl.models.elboxembeddings.examples.model_gda import ELBoxGDA

from mowl.models.boxsquaredel.model import BoxSquaredEL

from mowl.models.graph_random_walk.random_walk_w2v_model import RandomWalkPlusW2VModel
from mowl.models.graph_kge.graph_pykeen_model import GraphPlusPyKEENModel
from mowl.models.syntactic.w2v_model import SyntacticPlusW2VModel
Empty file.
130 changes: 130 additions & 0 deletions mowl/models/boxsquaredel/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
from mowl.evaluation.base import AxiomsRankBasedEvaluator
from mowl.projection.factory import projector_factory
from mowl.projection.edge import Edge
import logging
import numpy as np
from scipy.stats import rankdata
import torch as th


class BoxSquaredELPPIEvaluator(AxiomsRankBasedEvaluator):
    """Rank-based evaluator for protein-protein interaction axioms scored
    with a BoxSquaredEL model.

    Test axioms are projected to ``(head, relation, tail)`` edges and each
    one is ranked against every candidate tail entity. When filtered
    metrics are enabled, pairs already present in ``axioms_to_filter``
    (typically the training set) are penalized so they rank last.
    """

    def __init__(
            self,
            axioms,
            eval_method,
            axioms_to_filter,
            class_name_indexemb,
            rel_name_indexemb,
            device="cpu",
            verbose=False
    ):
        """
        :param axioms: axioms to evaluate (projected to edges lazily).
        :param eval_method: callable scoring a ``(n, 3)`` tensor of
            ``[head_idx, rel_idx, tail_idx]`` rows.
        :param axioms_to_filter: axioms whose pairs are excluded from
            filtered metrics.
        :param class_name_indexemb: dict mapping class names to embedding indices.
        :param rel_name_indexemb: dict mapping relation names to embedding indices.
        :param device: torch device for score computation.
        :param verbose: verbosity flag forwarded to the base evaluator.
        """
        super().__init__(axioms, eval_method, axioms_to_filter, device, verbose)

        self.class_name_indexemb = class_name_indexemb
        self.relation_name_indexemb = rel_name_indexemb

        # Lazy-initialization flags for the expensive setup steps.
        self._loaded_training_scores = False
        self._loaded_eval_data = False
        self._loaded_ht_data = False

    def _load_head_tail_entities(self):
        """Collect head/tail entity sets and their index mappings (idempotent)."""
        if self._loaded_ht_data:
            return

        ents, _ = Edge.getEntitiesAndRelations(self.axioms)
        ents_filter, _ = Edge.getEntitiesAndRelations(self.axioms_to_filter)

        entities = list(set(ents) | set(ents_filter))

        # Here any embedded class may appear on either side of an
        # interaction, so heads and tails are the same set; build both in
        # one pass (the original looped twice and logged each missing
        # entity twice).
        self.head_entities = set()
        self.tail_entities = set()
        for e in entities:
            if e in self.class_name_indexemb:
                self.head_entities.add(e)
                self.tail_entities.add(e)
            else:
                logging.info("Entity %s not present in the embeddings dictionary. Ignoring it.", e)

        # name -> embedding index
        self.head_name_indexemb = {k: self.class_name_indexemb[k] for k in self.head_entities}
        self.tail_name_indexemb = {k: self.class_name_indexemb[k] for k in self.tail_entities}

        # embedding index -> row/column position in the score matrix
        self.head_indexemb_indexsc = {v: k for k, v in enumerate(self.head_name_indexemb.values())}
        self.tail_indexemb_indexsc = {v: k for k, v in enumerate(self.tail_name_indexemb.values())}

        self._loaded_ht_data = True

    def _load_training_scores(self):
        """Build the penalty matrix used for filtered metrics.

        Entries for (head, tail) pairs found in ``axioms_to_filter`` are set
        to a large constant; multiplying raw scores by this matrix pushes
        known pairs to the bottom of the ranking.
        """
        if self._loaded_training_scores:
            return self.training_scores

        self._load_head_tail_entities()

        training_scores = np.ones((len(self.head_entities), len(self.tail_entities)),
                                  dtype=np.int32)

        if self._compute_filtered_metrics:
            # careful here: c must be in head entities and d must be in tail entities
            for axiom in self.axioms_to_filter:
                c, _, d = axiom.astuple()
                if (c not in self.head_entities) or (d not in self.tail_entities):
                    continue

                c, d = self.head_name_indexemb[c], self.tail_name_indexemb[d]
                c, d = self.head_indexemb_indexsc[c], self.tail_indexemb_indexsc[d]

                training_scores[c, d] = 10000

        logging.info("Training scores created")

        self._loaded_training_scores = True
        return training_scores

    def _init_axioms(self, axioms):
        """Project OWL axioms to a list of edges over ``interacts_with``."""
        if axioms is None:
            return None

        projector = projector_factory("taxonomy_rels", relations=["http://interacts_with"])

        edges = projector.project(axioms)
        return edges  # List of Edges

    def compute_axiom_rank(self, axiom):
        """Return ``(rank, filtered_rank, num_candidates)`` for one axiom.

        Returns ``(None, None, None)`` when either endpoint has no embedding.
        """
        self.training_scores = self._load_training_scores()

        c, r, d = axiom.astuple()

        if (c not in self.head_entities) or (d not in self.tail_entities):
            return None, None, None

        # Embedding indices
        c_emb_idx, d_emb_idx = self.head_name_indexemb[c], self.tail_name_indexemb[d]

        # Scores matrix labels.
        # Bug fix: the original split this tuple assignment across two
        # lines with no continuation, which raised a ValueError and left a
        # dead expression statement.
        c_sc_idx = self.head_indexemb_indexsc[c_emb_idx]
        d_sc_idx = self.tail_indexemb_indexsc[d_emb_idx]

        r = self.relation_name_indexemb[r]

        # Score (c, r, x) for every candidate tail entity x. Iteration
        # order of the set matches the order used to build
        # tail_indexemb_indexsc, so score rows align with matrix columns.
        data = th.tensor([
            [c_emb_idx, r, self.tail_name_indexemb[x]] for x in
            self.tail_entities]).to(self.device)

        res = self.eval_method(data).squeeze().cpu().detach().numpy()

        index = rankdata(res, method='average')
        rank = index[d_sc_idx]

        # Filtered rank: penalize known training pairs so they rank last.
        findex = rankdata((res * self.training_scores[c_sc_idx, :]), method='average')
        frank = findex[d_sc_idx]

        return rank, frank, len(self.tail_entities)
77 changes: 77 additions & 0 deletions mowl/models/boxsquaredel/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@

from mowl.nn import BoxSquaredELModule
from mowl.base_models.elmodel import EmbeddingELModel
from mowl.models.boxsquaredel.evaluate import BoxSquaredELPPIEvaluator
import torch as th
from torch import nn


class BoxSquaredEL(EmbeddingELModel):
    """
    Implementation based on [peng2020]_.
    """

    def __init__(self,
                 dataset,
                 embed_dim=50,
                 margin=0.02,
                 reg_norm=1,
                 learning_rate=0.001,
                 epochs=1000,
                 batch_size=4096 * 8,
                 delta=2.5,
                 reg_factor=0.2,
                 num_negs=4,
                 model_filepath=None,
                 device='cpu'
                 ):
        """
        :param dataset: EL dataset providing class/relation dictionaries.
        :param embed_dim: dimensionality of the box embeddings.
        :param margin: margin (gamma) used by the loss.
        :param reg_norm: regularization norm.
        :param learning_rate: optimizer learning rate.
        :param epochs: number of training epochs.
        :param batch_size: minibatch size.
        :param delta: delta hyperparameter of the BoxSquaredEL loss.
        :param reg_factor: regularization weight.
        :param num_negs: negatives sampled per positive.
        :param model_filepath: where the best model checkpoint is stored.
        :param device: torch device used by the module.
        """
        super().__init__(dataset, embed_dim, batch_size, extended=True,
                         model_filepath=model_filepath)

        # Loss / geometry hyperparameters.
        self.margin = margin
        self.reg_norm = reg_norm
        self.delta = delta
        self.reg_factor = reg_factor
        self.num_negs = num_negs
        # Optimization settings.
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.device = device
        self._loaded = False
        # NOTE(review): set to False here although the base class was
        # initialized with extended=True — confirm this is intentional.
        self.extended = False
        self.init_module()

    def init_module(self):
        """(Re)create the underlying BoxSquaredELModule on ``self.device``."""
        module = BoxSquaredELModule(
            len(self.class_index_dict),
            len(self.object_property_index_dict),
            embed_dim=self.embed_dim,
            gamma=self.margin,
            delta=self.delta,
            reg_factor=self.reg_factor
        )
        self.module = module.to(self.device)

    def train(self):
        # The training loop is provided by example subclasses.
        raise NotImplementedError

    def eval_method(self, data):
        """Score ``(head, rel, tail)`` batches with the module's GCI2 scorer."""
        return self.module.gci2_score(data)

    def get_embeddings(self):
        """Load the best checkpoint and return ``(class, relation)`` embedding dicts."""
        self.init_module()

        print('Load the best model', self.model_filepath)
        self.load_best_model()

        class_vectors = self.module.class_embed.weight.cpu().detach().numpy()
        relation_vectors = self.module.rel_embed.weight.cpu().detach().numpy()
        ent_embeds = dict(zip(self.class_index_dict.keys(), class_vectors))
        rel_embeds = dict(zip(self.object_property_index_dict.keys(), relation_vectors))
        return ent_embeds, rel_embeds

    def load_best_model(self):
        """Restore module weights from ``self.model_filepath`` and switch to eval mode."""
        self.init_module()
        self.module.load_state_dict(th.load(self.model_filepath))
        self.module.eval()

Loading
Loading