diff --git a/extraction/attributeclassification/actionclassification.py b/extraction/attributeclassification/actionclassification.py
index a07dbde659887eecb0afb06bc88f4ad2aa8d5697..3add9f9caaa5b91dd8a23963ebc7c2479354d690 100644
--- a/extraction/attributeclassification/actionclassification.py
+++ b/extraction/attributeclassification/actionclassification.py
@@ -9,10 +9,15 @@ from nltk.corpus import words
 
 class ActionClassifier:
 
-    def __init__(self, config, aug_log, embeddings):
+    def __init__(self, config, aug_log=None, embeddings=None, actions=None):
         self.config = config
         self.aug_log = aug_log
-        self.actions = list([act for act in self.aug_log.get_all_unique_values_for_role(ConceptType.ACTION_NAME.value) if any(tok in words.words() for tok in act.split(" "))])
+        if aug_log is not None:
+            self.actions = [act for act in self.aug_log.get_all_unique_values_for_role(ConceptType.ACTION_NAME.value) if any(tok in words.words() for tok in act.split(" "))]
+        elif actions is not None:
+            self.actions = actions
+        else:
+            raise ValueError("Either aug_log or actions must be provided")
         self.embeddings = embeddings
         self.lemmatizer = WordNetLemmatizer()
         with open(self.config.resource_dir / 'mitphb.json') as json_file:
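Note: ActionClassifier can now be built either from an augmented log or from a pre-computed action list. A minimal sketch of the two call patterns, assuming `config`, `aug_log`, and `embeddings` are created elsewhere by the surrounding application:

    # Path 1: derive candidate actions from the augmented log (original behavior).
    classifier = ActionClassifier(config, aug_log=aug_log, embeddings=embeddings)

    # Path 2: supply the actions directly; aug_log stays None (new behavior).
    classifier = ActionClassifier(config, embeddings=embeddings,
                                  actions=["create", "approve", "reject"])

Passing neither aug_log nor actions now fails fast with a ValueError instead of failing later with an AttributeError on the None aug_log.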
diff --git a/extraction/attributeclassification/simactionclassifier.py b/extraction/attributeclassification/simactionclassifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..160eb349197f6c9f431802bcc2815d7c1d8ee82c
--- /dev/null
+++ b/extraction/attributeclassification/simactionclassifier.py
@@ -0,0 +1,84 @@
+import operator
+
+from nltk import WordNetLemmatizer
+
+
+class SlimActionClassifier:
+
+    def __init__(self, embeddings, action_taxonomy):
+        self.embeddings = embeddings
+        self.lemmatizer = WordNetLemmatizer()
+        # all unique actions
+        unique_actions_taxonomy = set()
+        # a mapping from each unique action to its topmost ancestor(s)
+        child_to_upper_level = dict()
+        # all upper-level actions
+        upper_acts = set()
+        self.unique_actions_from_taxonomy(action_taxonomy, unique_actions_taxonomy, child_to_upper_level,
+                                          upper_acts)
+        self.unique_actions_taxonomy = unique_actions_taxonomy
+        self.child_to_upper_level = child_to_upper_level
+        self.upper_acts = upper_acts
+
+    def classify_actions(self, actions):
+        return {act: self.get_action_type_for_action(act) for act in actions}
+
+    def classify_action(self, action):
+        return self.get_action_type_for_action(action)
+
+    def unique_actions_from_taxonomy(self, action_taxonomy, unique_actions, child_to_upper_level, upper_acts, upper_level=None):
+        for act, children in action_taxonomy.items():
+            unique_actions.add(act)
+            if upper_level is None:
+                child_to_upper_level[act] = {act}
+                upper_acts.add(act)
+                ul = act
+            else:
+                if act in child_to_upper_level:
+                    child_to_upper_level[act].add(upper_level)
+                else:
+                    child_to_upper_level[act] = {upper_level}
+                ul = upper_level
+            for child in children:
+                self.unique_actions_from_taxonomy(child, unique_actions, child_to_upper_level, upper_acts, upper_level=ul)
+
+    def get_most_similar(self, action, taxonomy_actions, child_to_upper_level, upper_acts):
+        # very short strings are unlikely to be meaningful action names
+        if len(action) < 3:
+            return "None"
+        sims = {}
+        upper_level_sims = {}
+        for tax_action in taxonomy_actions:
+            try:
+                sim = self.embeddings.embeddings.similarity(action, tax_action)
+                if tax_action in upper_acts:
+                    upper_level_sims[tax_action] = sim
+                sims[tax_action] = sim
+            except KeyError:
+                # the phrase is out of vocabulary; fall back to the lemma of its last token
+                action = self.lemmatizer.lemmatize(action.split(" ")[-1])
+                try:
+                    sim = self.embeddings.embeddings.similarity(action, tax_action)
+                    if tax_action in upper_acts:
+                        upper_level_sims[tax_action] = sim
+                    sims[tax_action] = sim
+                except KeyError:
+                    pass
+        if len(sims) == 0:
+            return "None"
+        max_sim = max(sims.items(), key=operator.itemgetter(1))[0]
+        max_sim_upper = max(upper_level_sims.items(), key=operator.itemgetter(1))[0]
+        if len(child_to_upper_level[max_sim]) == 1:
+            max_sim = list(child_to_upper_level[max_sim])[0]
+        else:
+            # several candidate ancestors: keep the one most similar to the input
+            best_upper_sim = -1
+            for upper_level_act in child_to_upper_level[max_sim]:
+                upper_sim = upper_level_sims.get(upper_level_act, -1)
+                if upper_sim > best_upper_sim:
+                    max_sim = upper_level_act
+                    best_upper_sim = upper_sim
+        return max_sim if sims.get(max_sim, 0) > 0 else max_sim_upper
+
+    def get_action_type_for_action(self, action):
+        return self.get_most_similar(action, self.unique_actions_taxonomy, self.child_to_upper_level, self.upper_acts)
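The new classifier needs only a loaded embeddings wrapper and the taxonomy dict (nested mappings from an action to a list of child sub-taxonomies). A minimal usage sketch; the toy taxonomy below is illustrative and not the real contents of mitphb.json, and the expected outputs assume typical GloVe similarities:

    # Toy taxonomy: each top-level action maps to a list of child sub-taxonomies.
    taxonomy = {
        "create": [{"generate": []}, {"produce": []}],
        "destroy": [{"delete": []}],
    }
    embeddings = WordEmbeddings("glove-wiki-gigaword-50")
    classifier = SlimActionClassifier(embeddings, taxonomy)

    # Each action is mapped to its most similar top-level ancestor.
    classifier.classify_action("generate")              # expected: "create"
    classifier.classify_actions(["delete", "produce"])  # expected: {"delete": "destroy", "produce": "create"}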
diff --git a/extraction/data/word_embeddings.py b/extraction/data/word_embeddings.py
index a2a509befda302962568442589d2f81d050d8391..592dd01f7b64c55514dc911594a5bf5c50a9a74d 100644
--- a/extraction/data/word_embeddings.py
+++ b/extraction/data/word_embeddings.py
@@ -3,6 +3,5 @@ import gensim.downloader as api
 
 class WordEmbeddings:
-    def __init__(self, config):
-        self.config = config
-        self.embeddings = api.load(self.config.word_embeddings_file)
+    def __init__(self, word_embeddings_file):
+        self.embeddings = api.load(word_embeddings_file)
 
diff --git a/extraction/extract.py b/extraction/extract.py
index 5701770b3b05e0365f444731af54d143ff7095da..dc64fcd6a86aa23d076441d968abdf886b63ddd0 100644
--- a/extraction/extract.py
+++ b/extraction/extract.py
@@ -1,3 +1,4 @@
+import json
 import logging
 
 from pm4py.objects.log.obj import EventLog
@@ -8,6 +9,7 @@ import time
 
 from extraction.attributeclassification.actionclassification import ActionClassifier
 from extraction.attributeclassification.resourceclassifier import ResourceClassifier
+from extraction.attributeclassification.simactionclassifier import SlimActionClassifier
 from extraction.data.word_embeddings import WordEmbeddings
 from extraction.attributeclassification.attribute_classification import AttributeClassifier
 import extraction.preprocessing.preprocessor as pp
@@ -28,6 +30,7 @@ _model_folder = "model"
 _glove_model = "glove.6B.100d.txt"
 _spacy_model = "en_core_web_lg"
 _bert_model = "arebmann/model"
+_we_file = "glove-wiki-gigaword-50"
 
 
 def get_instance():
@@ -55,13 +58,16 @@ def _load_models():
         return bt
 
     bert = _load_bert()
-    return nlp_util, bert, kb
+    word_embeddings = WordEmbeddings(_we_file)
+    with open(load_model_from_package(_module_name) / _model_folder / 'mitphb.json') as json_file:
+        action_taxonomy = json.load(json_file)
+    return nlp_util, bert, kb, word_embeddings, action_taxonomy
 
 
 class Extraction:
 
     def __init__(self):
-        self._nlp_util, self._bert, self._kb = _load_models()
+        self._nlp_util, self._bert, self._kb, self._embeddings, self.action_taxonomy = _load_models()
 
     def extract_roles_from_label(self, label: str) -> dict:
         """
@@ -69,10 +75,15 @@ class Extraction:
         @param label: the textual value the roles should be extracted from
         @return: a dictionary of the form {'role type 1': ['role instance 1', 'role instance 2']}
         """
-
+        action_classifier = SlimActionClassifier(self._embeddings, self.action_taxonomy)
         cleaned = preprocessor.preprocess_label(label)
         tagged = self._bert.predict_single_label(cleaned)
-        return augmented_log.get_tagged(tagged[0], tagged[1], self._nlp_util)
+        cleaned = augmented_log.get_tagged(tagged[0], tagged[1], self._nlp_util)
+        if "action:name" in cleaned:
+            action = action_classifier.classify_actions(cleaned["action:name"])
+            if action is not None:
+                cleaned["action:type"] = action
+        return cleaned
 
     def extract_roles_from_list_of_labels(self, labels: list) -> dict:
         """
@@ -80,9 +91,7 @@ class Extraction:
         @param labels: the textual values the roles should be extracted from
         @return: a dictionary of the form {'initial label': {'role type 1': ['role instance 1', 'role instance 2']}}
         """
-        cleaned = [preprocessor.preprocess_label(label) for label in labels]
-        tagged = [self._bert.predict_single_label(single) for single in cleaned]
-        return {labels[i]: augmented_log.get_tagged(single[0], single[1], self._nlp_util) for i, single in enumerate(tagged)}
+        return {label: self.extract_roles_from_label(label) for label in labels}
 
     def _add_resource_and_action_types(self, aug_log, res_to_type, act_to_type):
         cls_to_res = {}
@@ -140,7 +149,7 @@ class Extraction:
         toc = time.perf_counter()
         print(f"Preprocessed the current log in {toc - tic:0.4f} seconds")
         print("load word embeddings " + config.word_embeddings_file)
-        word_embeddings = WordEmbeddings(config=config)
+        word_embeddings = WordEmbeddings(config.word_embeddings_file)
         print("BERT-based semantic tagging")
         print('semantic tagging text attributes')
         tic = time.perf_counter()
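End to end, role extraction now also returns an action:type entry produced by SlimActionClassifier. A hypothetical call (the exact role keys and values depend on the output of the BERT tagger):

    extraction = get_instance()
    result = extraction.extract_roles_from_label("create purchase order")
    # hypothetical result shape; classify_actions returns one type per action:
    # {'action:name': ['create'], 'object:name': ['purchase order'],
    #  'action:type': {'create': 'create'}}

Since extract_roles_from_list_of_labels now delegates to extract_roles_from_label, the per-label action-type enrichment applies there as well.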