From 80f43e69fc9f8b0497b320577bb0f3245abf1e73 Mon Sep 17 00:00:00 2001
From: Adrian Rebmann <rebmann@informatik.uni-mannheim.de>
Date: Thu, 14 Dec 2023 10:43:53 +0100
Subject: [PATCH 1/2] update versions and model

---
 README.md                                    | 28 +-
 .../actionclassification.py                  | 92 +-----
 .../attribute_classification.py              | 18 +-
 .../resourceclassifier.py                    | 64 +----
 .../subclassifiers/att_label_classifier.py   | 76 +----
 extraction/data/gathering/data_generator.py  | 97 ++++++-
 extraction/download.py                       | 10 -
 extraction/extract.py                        |  4 +-
 .../instancelabeling/bert_tagger/__init__.py | 13 -
 .../bert_tagger/bert_for_label_parsing.py    | 31 --
 .../bert_tagger/bert_preprocessor.py         | 88 ------
 .../bert_tagger/bert_tagger.py               | 43 ++-
 .../bert_tagger/bert_wrapper.py              | 272 ------------------
 setup.py                                     | 22 +-
 14 files changed, 164 insertions(+), 694 deletions(-)
 delete mode 100644 extraction/instancelabeling/bert_tagger/bert_for_label_parsing.py
 delete mode 100644 extraction/instancelabeling/bert_tagger/bert_preprocessor.py
 delete mode 100644 extraction/instancelabeling/bert_tagger/bert_wrapper.py

diff --git a/README.md b/README.md
index 8797ebc..34d1c23 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-# Semantic Role Extraction from Event Data
-Python package for extracting process related semantic roles from event data. <br>
+# Semantic Component Extraction from Event Data
+Python package for extracting process related semantic components from event data. <br>
 <sub>
 written by <a href="mailto:rebmann@informatik.uni-mannheim.de">Adrian Rebmann</a><br />
 </sub>
@@ -7,19 +7,19 @@ written by <a href="mailto:rebmann@informatik.uni-mannheim.de">Adrian Rebmann</a
 ## About
 The approach is described in:<br>
-*Rebmann, A., & van der Aa, H. (2021). Extracting Semantic Process Information from the Natural Language in Event Logs. <br>Advanced Information Systems Engineering. CAiSE 2021*
+*Rebmann, A., & van der Aa, H. (2022). Enabling Semantics-aware Process Mining through the Automatic Annotation of Event Logs. <br>Information Systems*
 ### Roles that are extracted, if available.
-| Role | Description | Example|
-| ------ | ------ | ------ |
-|'object:name'|The main object type(s) relevant to an event|purchase order|
-|'object:instance'|The main object instance(s) relevant to an event|purchase order 123|
-|'object:status'|An object’s status|open, closed|
-|'action:name'|The kind of action|create, send, receive|
-|'action:status'|An action’s status|started, paused|
-|'org:actor:name'|The type of active resource in the event|employee, system|
-|'org:actor:instance'|Information indicating the specific actor instance|employee 123|
-|'org:passive:name'|The type of passive resource related to the event|A recipient of a document|
-|'org:passive:instance'|Information indicating specific passive resources|A specific recipient of a document|
+| Component type | Description | Example|
+|------------------------| ------ | ------ |
+| 'object:name' |The main object type(s) relevant to an event|purchase order|
+| 'object:instance' |The main object instance(s) relevant to an event|purchase order 123|
+| 'object:status' |An object’s status|open, closed|
+| 'action:name' |The kind of action|create, send, receive|
+| 'action:status' |An action’s status|started, paused|
+| 'org:actor:name' |The type of active resource in the event|employee, system|
+| 'org:actor:instance' |Information indicating the specific actor instance|employee 123|
+| 'org:passive:name' |The type of passive resource related to the event|A recipient of a document|
+| 'org:passive:instance' |Information indicating specific passive resources|A specific recipient of a document|
 # Installation
 1. Install via pip: <code>pip install git+https://gitlab.uni-mannheim.de/processanalytics/semantic-role-extraction.git</code>
diff --git a/extraction/attributeclassification/actionclassification.py b/extraction/attributeclassification/actionclassification.py index c6b8b69..a07dbde 100644 --- a/extraction/attributeclassification/actionclassification.py +++ b/extraction/attributeclassification/actionclassification.py @@ -2,19 +2,11 @@ import json from nltk import WordNetLemmatizer -from numpy import array -from numpy import zeros -from keras.preprocessing.text import Tokenizer -from keras.preprocessing.sequence import pad_sequences -from keras.models import Sequential -from keras.layers import Dense -from keras.layers import Flatten -from keras.layers import Embedding - -from extraction.const import ConceptType, ACTION_IDX_TO_LABEL +from extraction.const import ConceptType import operator from nltk.corpus import words + class ActionClassifier: def __init__(self, config, aug_log, embeddings): @@ -38,8 +30,6 @@ class ActionClassifier: self.upper_acts = upper_acts def classify_actions(self): - #self.produce_gs() - #self.build_classifier() return {act: self.get_action_type_for_action(act) for act in self.actions} def unique_actions_from_taxonomy(self, action_taxonomy, unique_actions, child_to_upper_level, upper_acts, upper_level=None): @@ -88,46 +78,26 @@ class ActionClassifier: return "None" sims = {} upper_level_sims = {} - #combined_sims = {} for tax_action in taxonomy_actions: try: sim = self.embeddings.embeddings.similarity(action, tax_action) - #print(action, tax_action, sim) if tax_action in upper_acts: upper_level_sims[tax_action] = sim sims[tax_action] = sim except KeyError as e: - #print(e) action = self.lemmatizer.lemmatize((action.split(" ")[-1])) try: sim = self.embeddings.embeddings.similarity(action, tax_action) - #print(action, tax_action, sim) if tax_action in upper_acts: upper_level_sims[tax_action] = sim
sims[tax_action] = sim except KeyError as e: pass - #print(e, "after lemmatization still") if len(sims) == 0: return "None" - - # for u_act, u_sim in upper_level_sims.items(): - # for act, sim in sims.items(): - # if u_act in self.child_to_upper_level[act]: - # combined_sims[(u_act, act)] = u_sim + sim - max_sim = max(sims.items(), key=operator.itemgetter(1))[0] max_sim_upper = max(upper_level_sims.items(), key=operator.itemgetter(1))[0] max_sim_upper_ini = str(max_sim_upper) - #max_sim_combined = max(combined_sims.items(), key=operator.itemgetter(1))[0][0] - - #print("MAX any", action, max_sim, sims[max_sim]) - #print("MAX upper", action, max_sim_upper, sims[max_sim_upper]) - #print("MAX combi", action, max_sim_combined, sims[max_sim_combined]) - - #if sims[max_sim] <= upper_level_sims[max_sim_upper_ini]+0.05: - # max_sim = max_sim_upper_ini - if len(child_to_upper_level[max_sim]) == 1: max_sim = list(child_to_upper_level[max_sim])[0] else: @@ -136,67 +106,9 @@ class ActionClassifier: if upper_level_sims[upper_level_act] > max_sim_upper: max_sim = upper_level_act max_sim_upper = upper_level_sims[upper_level_act] - - #print("MAX top-level", action, max_sim, sims[max_sim]) - - #if sims[max_sim_any] < .5: - # return "None" return max_sim if sims[max_sim] > 0 else max_sim_upper_ini def get_action_type_for_action(self, action): return self.get_most_similar(action, self.unique_actions_taxonomy,self.child_to_upper_level, self.upper_acts) - def build_classifier(self): - with open(self.config.resource_dir + 'mitphb.json') as json_file: - action_taxonomy = json.load(json_file) - # all unique actions - unique_actions_from_taxonomy = set() - # a mapping from all unique actions to their top most ancestor(s) - child_to_upper_level = dict() - # all upper level actions - upper_acts = set() - self.unique_actions_from_taxonomy(action_taxonomy, unique_actions_from_taxonomy, child_to_upper_level, - upper_acts) - - # define documents - docs = [act for act in unique_actions_from_taxonomy] - # define class labels - label_2_idx = {lab: idx for idx, lab in ACTION_IDX_TO_LABEL.items()} - labels = array([label_2_idx[child_to_upper_level[doc].pop()] for doc in docs]) - print(docs) - print(labels) - # prepare tokenizer - t = Tokenizer() - t.fit_on_texts(docs) - vocab_size = len(t.word_index) + 1 - # integer encode the documents - encoded_docs = t.texts_to_sequences(docs) - print(encoded_docs) - # pad documents to a max length of 4 words - max_length = 2 - padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post') - print(padded_docs) - # create a weight matrix for words in training docs - embedding_matrix = zeros((vocab_size, 50)) - for word, i in t.word_index.items(): - if word in self.embeddings.embeddings: - embedding_vector = self.embeddings.embeddings[word] - embedding_matrix[i] = embedding_vector - # define model - model = Sequential() - e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=2, trainable=False) - model.add(e) - model.add(Flatten()) - model.add(Dense(1, activation='softmax')) - # compile the model - model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) - # summarize the model - print(model.summary()) - # fit the model - model.fit(padded_docs, labels, epochs=50, verbose=0) - # evaluate the model - loss, accuracy = model.evaluate(padded_docs, labels, verbose=0) - print('Accuracy: %f' % (accuracy * 100)) - - diff --git a/extraction/attributeclassification/attribute_classification.py 
b/extraction/attributeclassification/attribute_classification.py index 92b7633..810e7e2 100644 --- a/extraction/attributeclassification/attribute_classification.py +++ b/extraction/attributeclassification/attribute_classification.py @@ -47,7 +47,7 @@ class AttributeClassifier: idx.append((sen_idx[1] + k)) new_sen.insert((sen_idx[1] + k), (unique_val.split()[k], entity_type)) #print(new_sen) - return classifier.predict_single_label_full(get_plain_sentence(new_sen)), idx + return classifier.predict_single_label(get_plain_sentence(new_sen)), idx return None def find_state_pattern_in_label_using_bert(self, ld: AugmentedLog, classifier): @@ -107,10 +107,7 @@ class AttributeClassifier: if curr is not None: idx = curr[1] split_res = curr[0] - res = split_res[1] - tags = res[0][0] - prob = res[1][0] - #print(entity_type, prob) + tags = split_res[1] pred = [] act = [] num_matches = 0 @@ -130,14 +127,7 @@ class AttributeClassifier: - ignores values that are only numeric, a date or contain only one character """ the_dict = {} - # self.dummy_sent(classifier) - # all_sents = sen_gen.get_expressive_sentences() - # print("*"*40) - # print(len(all_sents)) - # print("*" * 40) - # sys.exit(0) sens = sen_gen.get_defined_sentences() - #print(sens) winners = {} for att in ld.get_attributes_by_att_types(consider_for_value_classification): winners[att] = [] @@ -161,9 +151,7 @@ class AttributeClassifier: the_dict[unique_val][entity_type] = curr[0] idx = curr[1] split_res = curr[0] - res = split_res[1] - tags = res[0][0] - prob = res[1][0] + tags = split_res[1] pred = [] act = [] num_matches = 0 diff --git a/extraction/attributeclassification/resourceclassifier.py b/extraction/attributeclassification/resourceclassifier.py index 06d1093..dabccab 100644 --- a/extraction/attributeclassification/resourceclassifier.py +++ b/extraction/attributeclassification/resourceclassifier.py @@ -1,26 +1,16 @@ -import json from collections import Counter from datetime import timedelta from nltk.corpus.reader import WordNetError -from numpy import asarray, array -from numpy import zeros from simpletransformers.classification import ClassificationModel -from tensorflow.keras.preprocessing.text import Tokenizer -from tensorflow.keras.preprocessing.sequence import pad_sequences -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Dense -from tensorflow.keras.layers import Flatten -from tensorflow.keras.layers import Embedding import pandas as pd import numpy as np from nltk.corpus import wordnet as wn -from extraction.const import ConceptType, RESOURCE_IDX_TO_LABEL, TERMS_FOR_MISSING, AttributeType, type_mapping +from extraction.const import ConceptType, RESOURCE_IDX_TO_LABEL, TERMS_FOR_MISSING, type_mapping from extraction.model.augmented_log import AugmentedLog -from extraction.preprocessing.preprocessor import preprocess_label, clean_attribute_name, check_for_uuid -from extraction.data.gathering.schemaorgextraction import read_and_extract +from extraction.preprocessing.preprocessor import preprocess_label, check_for_uuid from nltk import WordNetLemmatizer from nltk.corpus import words @@ -303,56 +293,6 @@ class ResourceClassifier: clear_preds_res_bert[res] = value return clear_preds_res, clear_preds_res_text, clear_preds_res_misc, clear_preds_res_ne, clear_preds_res_wn, clear_preds_res_bert, clear_preds_res_other - def train_and_classify(self): - actor_terms, _, _, _, _ = read_and_extract(self.config.resource_dir) - docs = [] - labels = [] - for doc in actor_terms: - docs.append(preprocess_label(doc)) 
- labels.append(0) - - labels = array(labels) - t = Tokenizer() - t.fit_on_texts(docs) - vocab_size = len(t.word_index) + 1 - # integer encode the documents - encoded_docs = t.texts_to_sequences(docs) - print(encoded_docs) - # pad documents to a max length of 4 words - max_length = 4 - padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post') - print(padded_docs) - # load the whole embedding into memory - embeddings_index = dict() - f = open(self.config.resource_dir + '/glove.6B.100d.txt') - for line in f: - values = line.split() - word = values[0] - coefs = asarray(values[1:], dtype='float32') - embeddings_index[word] = coefs - f.close() - print('Loaded %s word vectors.' % len(embeddings_index)) - # create a weight matrix for words in training docs - embedding_matrix = zeros((vocab_size, 100)) - for word, i in t.word_index.items(): - embedding_vector = embeddings_index.get(word) - if embedding_vector is not None: - embedding_matrix[i] = embedding_vector - # define .model - model = Sequential() - e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False) - model.add(e) - model.add(Flatten()) - model.add(Dense(1, activation='sigmoid')) - # compile the .model - model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) - # summarize the .model - print(model.summary()) - # fit the .model - model.fit(padded_docs, labels, epochs=50, verbose=0) - # evaluate the .model - loss, accuracy = model.evaluate(padded_docs, labels, verbose=0) - print('Accuracy: %f' % (accuracy * 100)) def check_wordnet(self, original, word, res, cnt=0): # print(word) diff --git a/extraction/attributeclassification/subclassifiers/att_label_classifier.py b/extraction/attributeclassification/subclassifiers/att_label_classifier.py index 6afc110..be1844c 100644 --- a/extraction/attributeclassification/subclassifiers/att_label_classifier.py +++ b/extraction/attributeclassification/subclassifiers/att_label_classifier.py @@ -12,14 +12,6 @@ import numpy as np from extraction.readwrite.loader import deserialize_model from extraction.readwrite.writer import serialize_model from nltk.corpus import words -from numpy import array -from numpy import zeros -from keras.preprocessing.text import Tokenizer -from keras.preprocessing.sequence import pad_sequences -from keras.models import Sequential -from keras.layers import Dense -from keras.layers import Flatten -from keras.layers import Embedding CAiSE_VERSION = False @@ -58,15 +50,15 @@ class AttributeLabelClassifier: def with_tf_idf_and_embedding(self, eval_mode=False): # self.build_classifier() res = {} - md = deserialize_model(self.path, "att_class") - + #md = deserialize_model(self.path, "att_class") + md = False if md is False or eval_mode is True: print("Build new attribute classifier") tfidf = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2)) tfidf.fit_transform(self.d["text"].values) # test = self.test["text"].values # Now lets create a dict so that for every word in the corpus we have a corresponding IDF value - idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_)) + idf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_)) x_train = tfidf_glove(idf_dict, self.d["text"].values, self.embeddings) # x_test = tfidf_glove(idf_dict, test, glove) enc = LabelEncoder() @@ -98,10 +90,13 @@ class AttributeLabelClassifier: # res[plain] = "BO", 1 elif any(self.check_proper_word(tok) for tok in clean.split(" ")): x_t = tfidf_glove(idf_dict, [clean], self.embeddings) - probas = 
clzz.predict_proba(x_t)[0] - pred = enc.inverse_transform(clzz.predict(x_t))[0] + try: + probas = clzz.predict_proba(x_t)[0] + pred = enc.inverse_transform(clzz.predict(x_t))[0] #print(clean, plain, pred, probas[X.index(pred)]) - res[plain] = pred, probas[X.index(pred)] + res[plain] = pred, probas[X.index(pred)] + except AttributeError: + res[plain] = "X", 1 else: #print(plain, "X", 1) res[plain] = "X", 1 @@ -115,63 +110,12 @@ class AttributeLabelClassifier: def pp_doc(self, doc): return doc.replace("type", "").replace("uuid", "").replace("identity", "").replace("id", "") - def build_classifier(self): - res = {} - docs = [act for act in self.d["text"].values] - # define class labels - label_2_idx = {lab: idx for idx, lab in enumerate(self.d["y"].unique())} # TODO - print(label_2_idx) - idx_2_label = {idx: lab for lab, idx in label_2_idx.items()} # TODO - labels = array([label_2_idx[lab] for lab in self.d["y"].values]) # TODO - print(docs) - print(labels) - # prepare tokenizer - t = Tokenizer() - t.fit_on_texts(docs) - vocab_size = len(t.word_index) + 1 - # integer encode the documents - encoded_docs = t.texts_to_sequences(docs) - print(encoded_docs) - # pad documents to a max length of 4 words - max_length = 2 - padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post') - print(padded_docs) - # create a weight matrix for words in training docs - embedding_matrix = zeros((vocab_size, 50)) - for word, i in t.word_index.items(): - if word in self.embeddings: - embedding_vector = self.embeddings[word] - embedding_matrix[i] = embedding_vector - # define model - model = Sequential() - e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=2, trainable=False) - model.add(e) - model.add(Flatten()) - model.add(Dense(1, activation='softmax')) - # compile the model - model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) - # summarize the model - print(model.summary()) - # fit the model - model.fit(padded_docs, labels, epochs=50, verbose=0) - # evaluate the model - loss, accuracy = model.evaluate(padded_docs, labels, verbose=0) - print('Accuracy: %f' % (accuracy * 100)) - - for plain in self.cols: - clean = self.prepare_name(plain) - if any(self.check_proper_word(tok) for tok in clean.split(" ")): - encoded_clean = t.texts_to_sequences([clean]) - padded_clean = pad_sequences(encoded_clean, maxlen=max_length, padding='post') - print(clean) - print(model.predict(padded_clean)) - def prepare_name(self, plain): return clean_attribute_name(plain).replace("doc", "document").replace("type", "").replace("uuid", "").replace( "identity", "").replace("id", "") def build_log_reg(train_features, y_train, alpha=1e-4): - log_reg = SGDClassifier(loss='log', alpha=alpha, n_jobs=-1, penalty='l2') + log_reg = SGDClassifier(loss='hinge', alpha=alpha, n_jobs=-1, penalty='l2') log_reg.fit(train_features, y_train) return log_reg diff --git a/extraction/data/gathering/data_generator.py b/extraction/data/gathering/data_generator.py index d13cc67..2678589 100644 --- a/extraction/data/gathering/data_generator.py +++ b/extraction/data/gathering/data_generator.py @@ -1,10 +1,99 @@ import random from extraction.const import ConceptType -import extraction.instancelabeling.bert_tagger.bert_preprocessor as bp rec_pronouns = ["to"] act_pronouns = ["by"] +def data_to_object(dataframe, cols=False, only_label=[]): + objects = [] + for index, row in dataframe.iterrows(): + if not cols: + objects.append(DataObject(row)) + else: + for col in dataframe.columns: + if 
(len(only_label) == 0) and ('_tags' in col): + objects.append(DataObject(row, col)) + elif len(only_label) > 0: + for consider in only_label: + if consider in col: + objects.append(DataObject(row, col)) + return objects + + +def parse_tags(item, col): + w, t = split_tags(item[col]) + return w, t + + +def split_tags(tags): + w = [] + t = [] + tag_sets = tags.split(',') + tag_sets = tag_sets[0:-1] + for tag_set in tag_sets: + word = tag_set.split('<>')[0].strip(' ') + tag = tag_set.split('<>')[1].strip(' ') + w.append(word) + t.append(tag) + return w, t + + +def to_list_format(train_objects): + label_to_list_tuple = {} + for item in train_objects: + label_to_list_tuple[item.label] = (item.split.copy(), item.tags.copy()) + return label_to_list_tuple + + +def to_tuple_format(train_objects): + train_sentences_semantic = {} + for item in train_objects: + curr = [] + for i in range(len(item.split)): + curr.append((item.split[i], item.tags[i])) + train_sentences_semantic[' '.join(item.split)] = curr + return train_sentences_semantic + + +class DataObject: + def __init__(self, item, col='Tags'): + self.split, self.tags = parse_tags(item, col) + lab = '' + for part in self.split: + lab += part + ' ' + self.label = lab.strip() + + +def prepare_data(train_objects): + train_sentences = [] + for item in train_objects: + curr = [] + for i in range(len(item.split)): + curr.append((item.split[i], item.tags[i])) + train_sentences.append(curr) + + +def prepare_tag_set(train_sentences): + tags = set([item for sublist in train_sentences for _, item in sublist]) + tag2idx = {} + idx2tag = {} + for i, tag in enumerate(sorted(tags)): + tag2idx[tag] = i + 1 + idx2tag[i + 1] = tag + + # Special character for the tags + tag2idx['[PAD]'] = 0 + idx2tag[0] = '[PAD]' + + +def _tag_sequence(sentences): + return [[t for w, t in sentence] for sentence in sentences] + + +def _text_sequence(sentences): + return [[w for w, t in sentence] for sentence in sentences] + + def get_plain_sentence(sen): res = '' @@ -16,9 +105,9 @@ def get_plain_sentence(sen): class DataGenerator: def _fill_sets(self, tagged_instances): - tagged_obj = bp.data_to_object(tagged_instances) - self.label_to_tagging = bp.to_list_format(tagged_obj) - self.originals = bp.to_tuple_format(tagged_obj) + tagged_obj = data_to_object(tagged_instances) + self.label_to_tagging = to_list_format(tagged_obj) + self.originals = to_tuple_format(tagged_obj) bos = set() actions = set() actors = set() diff --git a/extraction/download.py b/extraction/download.py index c0e1be3..5f3c750 100644 --- a/extraction/download.py +++ b/extraction/download.py @@ -24,16 +24,6 @@ def download(model): ) -def require_package(name): - try: - import pkg_resources - - pkg_resources.working_set.require(name) - return True - except: # noqa: E722 - return False - - def download_model(name): spacy.cli.download(_spacy_model) download_url = _model_repo diff --git a/extraction/extract.py b/extraction/extract.py index 78d5cde..5701770 100644 --- a/extraction/extract.py +++ b/extraction/extract.py @@ -27,6 +27,7 @@ _model_folder = "model" _glove_model = "glove.6B.100d.txt" _spacy_model = "en_core_web_lg" +_bert_model = "arebmann/model" def get_instance(): @@ -50,7 +51,7 @@ def _load_models(): nlp_util = NLPUtils(_spacy_model) def _load_bert(): - bt = BertTagger(data_path, nlp_util) + bt = BertTagger(_bert_model, nlp_util) return bt bert = _load_bert() @@ -142,6 +143,7 @@ class Extraction: word_embeddings = WordEmbeddings(config=config) print("BERT-based semantic tagging") print('semantic tagging text 
attributes') + tic = time.perf_counter() self._bert.get_tags_for_df(aug_log) toc = time.perf_counter() print(f"Tagged the whole data set in {tic - toc:0.4f} seconds") diff --git a/extraction/instancelabeling/bert_tagger/__init__.py b/extraction/instancelabeling/bert_tagger/__init__.py index 484ead0..e69de29 100644 --- a/extraction/instancelabeling/bert_tagger/__init__.py +++ b/extraction/instancelabeling/bert_tagger/__init__.py @@ -1,13 +0,0 @@ -from .bert_wrapper import BertWrapper -from .bert_for_label_parsing import BertForLabelParsing - -''' -the code in this module is for the most part adapted from the implementation in the context of the publication - -@inproceedings{shelmanov2019bibm, - title={Active Learning with Deep Pre-trained Models for Sequence Tagging of Clinical and Biomedical Texts}, - author={Artem Shelmanov and Vadim Liventsev and Danil Kireev and Nikita Khromov and Alexander Panchenko and Irina Fedulova and Dmitry V. Dylov}, - booktitle={Proceedings of International Conference on Bioinformatics & Biomedicine (BIBM)}, - year={2019} -} -''' \ No newline at end of file diff --git a/extraction/instancelabeling/bert_tagger/bert_for_label_parsing.py b/extraction/instancelabeling/bert_tagger/bert_for_label_parsing.py deleted file mode 100644 index e8a4bd6..0000000 --- a/extraction/instancelabeling/bert_tagger/bert_for_label_parsing.py +++ /dev/null @@ -1,31 +0,0 @@ -from transformers import BertForTokenClassification - -from torch.nn import CrossEntropyLoss - - -class BertForLabelParsing(BertForTokenClassification): - def __init__(self, config): - super().__init__(config) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, - position_ids=None, head_mask=None, loss_mask=None): - outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = (attention_mask.view(-1) == 1) - if loss_mask is not None: - active_loss &= loss_mask.view(-1) - active_logits = logits.view(-1, self.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs - return outputs # (loss), scores, (hidden_states), (attentions) diff --git a/extraction/instancelabeling/bert_tagger/bert_preprocessor.py b/extraction/instancelabeling/bert_tagger/bert_preprocessor.py deleted file mode 100644 index 77ad24b..0000000 --- a/extraction/instancelabeling/bert_tagger/bert_preprocessor.py +++ /dev/null @@ -1,88 +0,0 @@ -def data_to_object(dataframe, cols=False, only_label=[]): - objects = [] - for index, row in dataframe.iterrows(): - if not cols: - objects.append(DataObject(row)) - else: - for col in dataframe.columns: - if (len(only_label) == 0) and ('_tags' in col): - objects.append(DataObject(row, col)) - elif len(only_label) > 0: - for consider in only_label: - if consider in col: - objects.append(DataObject(row, col)) - return objects - - -def parse_tags(item, col): - w, t = split_tags(item[col]) - return w, t - - -def split_tags(tags): - w = [] - t = [] - tag_sets = tags.split(',') - 
tag_sets = tag_sets[0:-1] - for tag_set in tag_sets: - word = tag_set.split('<>')[0].strip(' ') - tag = tag_set.split('<>')[1].strip(' ') - w.append(word) - t.append(tag) - return w, t - - -def to_list_format(train_objects): - label_to_list_tuple = {} - for item in train_objects: - label_to_list_tuple[item.label] = (item.split.copy(), item.tags.copy()) - return label_to_list_tuple - - -def to_tuple_format(train_objects): - train_sentences_semantic = {} - for item in train_objects: - curr = [] - for i in range(len(item.split)): - curr.append((item.split[i], item.tags[i])) - train_sentences_semantic[' '.join(item.split)] = curr - return train_sentences_semantic - - -class DataObject: - def __init__(self, item, col='Tags'): - self.split, self.tags = parse_tags(item, col) - lab = '' - for part in self.split: - lab += part + ' ' - self.label = lab.strip() - - -def prepare_data(train_objects): - train_sentences = [] - for item in train_objects: - curr = [] - for i in range(len(item.split)): - curr.append((item.split[i], item.tags[i])) - train_sentences.append(curr) - - -def prepare_tag_set(train_sentences): - tags = set([item for sublist in train_sentences for _, item in sublist]) - tag2idx = {} - idx2tag = {} - for i, tag in enumerate(sorted(tags)): - tag2idx[tag] = i + 1 - idx2tag[i + 1] = tag - - # Special character for the tags - tag2idx['[PAD]'] = 0 - idx2tag[0] = '[PAD]' - - -def _tag_sequence(sentences): - return [[t for w, t in sentence] for sentence in sentences] - - -def _text_sequence(sentences): - return [[w for w, t in sentence] for sentence in sentences] diff --git a/extraction/instancelabeling/bert_tagger/bert_tagger.py b/extraction/instancelabeling/bert_tagger/bert_tagger.py index 0c38767..1f2ae1a 100644 --- a/extraction/instancelabeling/bert_tagger/bert_tagger.py +++ b/extraction/instancelabeling/bert_tagger/bert_tagger.py @@ -1,10 +1,9 @@ import pandas as pd import random +from transformers import pipeline from extraction.model.augmented_log import AugmentedLog from extraction.const import consider_for_tagging, ConceptType, AttributeType, TERMS_FOR_MISSING -from extraction.instancelabeling.bert_tagger import BertWrapper, BertForLabelParsing - def get_train_val_test_split(sentences): random.shuffle(sentences) @@ -19,6 +18,24 @@ def get_train_val_test_split(sentences): return train_data, val_data, test_data +def merge_hashtags(words_list): + merged_list = [] + for entry in words_list: + word = entry['word'] + if word.startswith('##'): + merged_list[-1]['word'] += word[2:] + else: + merged_list.append(entry) + return merged_list + + +def parse_result(result): + res = [], [] + for entry in result: + res[0].append(entry['word']) + res[1].append(entry['entity']) + return res + class BertTagger: @@ -41,7 +58,7 @@ class BertTagger: return new_pred def _load_trained_model(self, path): - self.model = BertWrapper.load_serialized(path, BertForLabelParsing) + self.model = pipeline("ner", model=path) def tag_log(self, ld: AugmentedLog): return self._add_tags(ld) @@ -69,16 +86,17 @@ class BertTagger: seen_tagged = ld.tagged_labels tagged_per_attribute = {} cols_to_be_tagged = ld.get_attributes_by_att_types(consider_for_tagging) + print("cols to be tagged", cols_to_be_tagged) for v in cols_to_be_tagged: - unique_labels = [val for val in ld.att_to_unique[v] if val not in TERMS_FOR_MISSING] prefixes = set(lab.split(" ")[0] for lab in unique_labels) if len(prefixes) != 1: tagged_per_attribute[v] = {} predicted = self.predict_batch_at_once(unique_labels) + print(predicted) for unique, pred in 
zip(unique_labels, predicted): if unique not in seen_tagged: - pred = self.check_tok_for_object_type(unique.split(), pred) + pred = self.check_tok_for_object_type(unique.split(), pred[1]) tagged_per_attribute[v][unique] = unique.split(), pred seen_tagged[unique] = unique.split(), pred else: @@ -93,6 +111,7 @@ class BertTagger: print('semantic tagging: ' + col_name) unique_labels = ld.att_to_unique[col_name] predicted = self.predict_batch_at_once(unique_labels) + print(predicted) for unique, pred in zip(unique_labels, predicted): if unique not in seen_tagged: pred = self.check_tok_for_object_type(unique.split(), pred) @@ -108,15 +127,15 @@ class BertTagger: return tagged def predict_single_label(self, label): - split, pred = label.split(), self.model.predict([label.split()])[0][0] + merged = merge_hashtags(self.model(label)) + split, pred = parse_result(merged) pred = self.check_tok_for_object_type(split, pred) return split, pred def predict_batch_at_once(self, labels): - return self.model.predict([label.split() for label in labels])[0] - - def predict_single_label_full(self, label): - return label.split(), self.model.predict([label.split()]) + res = self.model(labels) + merged = [merge_hashtags(x) for x in res] + return [parse_result(x) for x in merged] @staticmethod def _fill_all(x, seen_tagged): @@ -129,7 +148,3 @@ class BertTagger: uniquely_tagged.append(tagging) return uniquely_tagged - def serialize_model(self): - self.model.save_serialize('./.model/') - - diff --git a/extraction/instancelabeling/bert_tagger/bert_wrapper.py b/extraction/instancelabeling/bert_tagger/bert_wrapper.py deleted file mode 100644 index bcde46c..0000000 --- a/extraction/instancelabeling/bert_tagger/bert_wrapper.py +++ /dev/null @@ -1,272 +0,0 @@ -import torch -import numpy as np -import pickle -import json -import os -from torch.utils.data import DataLoader - -from tensorflow.keras.preprocessing.sequence import pad_sequences - -from .bert_for_label_parsing import BertForLabelParsing - - -def _add_x_labels(labels, bpe_masks): - result_labels = [] - for l_sent, m_sent in zip(labels, bpe_masks): - m_sent = m_sent[1:-1] - sent_res = [] - i = 0 - for l in l_sent: - sent_res.append(l) - - i += 1 - while i < len(m_sent) and (m_sent[i] == 0): - i += 1 - sent_res.append('[PAD]') - - result_labels.append(sent_res) - - return result_labels - - -class BertWrapper: - def __init__(self, bert_model, bpe_tokenizer, idx2tag, tag2idx, - max_len=100, pred_loader_args={'num_workers' : 1}, - pred_batch_size=100): - super().__init__() - - self._bert_model = bert_model - self._bpe_tokenizer = bpe_tokenizer - self._idx2tag = idx2tag - self._tag2idx = tag2idx - self._max_len = max_len - self._pred_loader_args = pred_loader_args - self._pred_batch_size = pred_batch_size - - def _bpe_tokenize(self, words): - new_words = [] - bpe_masks = [] - for word in words: - bpe_tokens = self._bpe_tokenizer.tokenize(word) - new_words += bpe_tokens - bpe_masks += [1] + [0] * (len(bpe_tokens) - 1) - - return new_words, bpe_masks - - def _make_tokens_tensors(self, tokens, max_len): - bpe_tokens, bpe_masks = tuple(zip(*[self._bpe_tokenize(sent) for sent in tokens])) - bpe_tokens = prepare_bpe_tokens_for_bert(bpe_tokens, max_len=max_len) - bpe_masks = [[1] + masks[:max_len-2] + [1] for masks in bpe_masks] - max_len = max(len(sent) for sent in bpe_tokens) - token_ids = torch.tensor(create_tensors_for_tokens(self._bpe_tokenizer, bpe_tokens, max_len=max_len)) - token_masks = generate_masks(token_ids) - return bpe_tokens, max_len, token_ids, token_masks, 
bpe_masks - - def _make_label_tensors(self, labels, bpe_masks, max_len): - bpe_labels = _add_x_labels(labels, bpe_masks) - bpe_labels = prepare_bpe_labels_for_bert(bpe_labels, max_len=max_len) - label_ids = torch.tensor(create_tensors_for_labels(self._tag2idx, bpe_labels, max_len=max_len)) - loss_masks = label_ids != self._tag2idx['[PAD]'] - return label_ids, loss_masks - - def _logits_to_preds(self, logits, bpe_masks, tokens): - #print(self._idx2tag) - preds = logits.argmax(dim=2).numpy() - probs = logits.numpy().max(axis=2) - prob = [np.mean([p for p, m in zip(prob[:len(masks)], masks[:len(prob)]) if m][1:-1]) - for prob, masks in zip(probs, bpe_masks)] - try: - #print('in 1') - preds = [[self._idx2tag[(int(p))] for p, m in zip(pred[:len(masks)], masks[:len(pred)]) if m][1:-1] - for pred, masks in zip(preds, bpe_masks)] - except KeyError: - #print('in 2') - preds = [[self._idx2tag[(str(p))] for p, m in zip(pred[:len(masks)], masks[:len(pred)]) if m][1:-1] - for pred, masks in zip(preds, bpe_masks)] - preds = [pred + ['O']*(max(0, len(toks) - len(pred))) for pred, toks in zip(preds, tokens)] - return preds, prob - - def generate_tensors_for_prediction(self, evaluate, dataset_row): - dataset_row = dataset_row - labels = None - if evaluate: - tokens, labels = tuple(zip(*dataset_row)) - else: - tokens = dataset_row - - _, max_len, token_ids, token_masks, bpe_masks = self._make_tokens_tensors(tokens, self._max_len) - label_ids = None - loss_masks = None - - if evaluate: - label_ids, loss_masks = self._make_label_tensors(labels, bpe_masks, max_len) - - return token_ids, token_masks, bpe_masks, label_ids, loss_masks, tokens, labels - - def predict(self, dataset, evaluate=False, metrics=None): - if metrics is None: - metrics = [] - - self._bert_model.eval() - - dataloader = DataLoader(dataset, - collate_fn=lambda dataset_row: self.generate_tensors_for_prediction(evaluate, dataset_row), - **self._pred_loader_args, - batch_size=self._pred_batch_size) - predictions = [] - probas = [] - if evaluate: - cum_loss = 0. 
- true_labels = [] - for nb, tensors in enumerate(dataloader): - token_ids, token_masks, bpe_masks, label_ids, loss_masks, tokens, labels = tensors - if evaluate: - true_labels.extend(labels) - with torch.no_grad(): - token_ids = token_ids#.cuda() - token_masks = token_masks#.cuda() - - if evaluate: - label_ids = label_ids#.cuda() - loss_masks = loss_masks#.cuda() - - if type(self._bert_model) is BertForLabelParsing: - logits = self._bert_model(token_ids, - token_type_ids=None, - attention_mask=token_masks, - labels=label_ids, - loss_mask=loss_masks) - else: - logits = self._bert_model(token_ids, - token_type_ids=None, - attention_mask=token_masks, - labels=label_ids,) - if evaluate: - loss, logits = logits - cum_loss += loss.mean().item() - else: - logits = logits[0] - b_preds, b_prob = self._logits_to_preds(logits.cpu(), bpe_masks, tokens) - predictions.extend(b_preds) - probas.extend(b_prob) - if evaluate: - cum_loss /= (nb + 1) - result_metrics = [] - for metric in metrics: - result_metrics.append(metric(true_labels, predictions)) - return predictions, probas, tuple([cum_loss] + result_metrics) - else: - return predictions, probas - - def generate_tensors_for_training(self, tokens, labels): - _, max_len, token_ids, token_masks, bpe_masks = self._make_tokens_tensors(tokens, self._max_len) - label_ids, loss_masks = self._make_label_tensors(labels, bpe_masks, max_len) - return token_ids, token_masks, label_ids, loss_masks - - def generate_feature_tensors_for_prediction(self, tokens): - _, max_len, token_ids, token_masks, bpe_masks = self._make_tokens_tensors(tokens, self._max_len) - return token_ids, token_masks, bpe_masks - - def batch_loss_tensors(self, *tensors): - token_ids, token_masks, label_ids, loss_masks = tensors - token_ids = token_ids.cuda() - token_masks = token_masks.cuda() - label_ids = label_ids.cuda() - loss_masks = loss_masks.cuda() - - if type(self._bert_model) is BertForLabelParsing: - output = self._bert_model(token_ids, - token_type_ids=None, - attention_mask=token_masks, - labels=label_ids, - loss_mask=loss_masks) - else: - output = self._bert_model(token_ids, - token_type_ids=None, - attention_mask=token_masks, - labels=label_ids) - - loss = output[0] - return loss.mean() - - def batch_loss(self, tokens, labels): - token_ids, token_masks, label_ids, loss_masks = self.generate_tensors_for_training(tokens, labels) - return self.batch_loss_tensors(token_ids, None, token_masks, label_ids, loss_masks) - - def batch_logits(self, tokens): - _, max_len, token_ids, token_masks, __ = self._make_tokens_tensors(tokens, self._max_len) - token_ids = token_ids#.cuda() - token_masks = token_masks#.cuda() - - logits = self._bert_model(token_ids, - token_type_ids=None, - attention_mask=token_masks, - labels=None, - loss_mask=None)[0] - - return logits - - def save_serialize(self, save_dir_path): - if not os.path.exists(save_dir_path): - os.makedirs(save_dir_path) - - torch.save(self._bert_model.state_dict(), os.path.join(save_dir_path, 'pytorch_model.bin')) - with open(os.path.join(save_dir_path, 'bpe_tokenizer.pckl'), 'wb') as f: - pickle.dump(self._bpe_tokenizer, f) - - self._bert_model.config.save_pretrained(os.path.join(save_dir_path)) - - parameters_dict = { - 'idx2tag' : self._idx2tag, - 'tag2idx' : self._tag2idx, - 'max_len' : self._max_len, - 'pred_loader_args' : self._pred_loader_args, - 'pred_batch_size' : self._pred_batch_size - } - with open(os.path.join(save_dir_path, 'sec_parameters.json'), 'w') as f: - json.dump(parameters_dict, f) - - @classmethod - def 
load_serialized(cls, load_dir_path, bert_model_type): - print("loading serialized .model") - with open(os.path.join(load_dir_path, 'sec_parameters.json'), 'r') as f: - parameters_dict = json.load(f) - - bert_model = bert_model_type.from_pretrained(load_dir_path)#.cuda() - - with open(os.path.join(load_dir_path, 'bpe_tokenizer.pckl'), 'rb') as f: - bpe_tokenizer = pickle.load(f) - - return BertWrapper(bert_model, bpe_tokenizer, - idx2tag=parameters_dict['idx2tag'], - tag2idx=parameters_dict['tag2idx'], - max_len=parameters_dict['max_len'], - pred_loader_args=parameters_dict['pred_loader_args'], - pred_batch_size=parameters_dict['pred_batch_size']) - print(".model loaded") - - - -def prepare_bpe_tokens_for_bert(tokens, max_len): - return [['[CLS]'] + list(toks[:max_len - 2]) + ['[SEP]'] for toks in tokens] - - -def prepare_bpe_labels_for_bert(labels, max_len): - return [['[PAD]'] + list(ls[:max_len - 2]) + ['[PAD]'] for ls in labels] - - -def generate_masks(input_ids): - res = input_ids > 0 - return res.astype('float') if type(input_ids) is np.ndarray else res - - -def create_tensors_for_tokens(bpe_tokenizer, sents, max_len): - return pad_sequences([bpe_tokenizer.convert_tokens_to_ids(sent) for sent in sents], - maxlen=max_len, dtype='long', - truncating='post', padding='post') - - -def create_tensors_for_labels(tag2idx, labels, max_len): - return pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], - maxlen=max_len, value=tag2idx['[PAD]'], padding='post', - dtype='long', truncating='post') diff --git a/setup.py b/setup.py index 3eb017e..49a958d 100644 --- a/setup.py +++ b/setup.py @@ -8,22 +8,16 @@ setup( 'extraction.model', 'extraction.readwrite', 'extraction.attributeclassification', 'extraction.attributeclassification.subclassifiers'], #include_package_data=True, install_requires=[ - 'pandas==1.3.1', + 'pandas', 'pm4py==2.2.16', - 'nltk==3.6.2', + 'nltk', 'numpy', - 'scikit-learn==0.24.2', - 'torch==1.9.0', - 'transformers==4.9.2', - 'tqdm==4.58.0', - 'tensorflow==2.7.0', - 'graphviz==0.17', - 'spacy==3.1.1', - 'pytorch-transformers==1.2.0', - 'plac==1.1.3', - 'wasabi==0.8.2', - 'requests==2.25.1', - 'keras', + 'scikit-learn', + 'transformers', + 'spacy', + 'wasabi', + 'plac', + 'torch', 'simpletransformers', 'matplotlib', 'gensim' -- GitLab From 35eac4d518bbc43e26be79fc66fb21d0c9f47afa Mon Sep 17 00:00:00 2001 From: Adrian Rebmann <rebmann@informatik.uni-mannheim.de> Date: Thu, 14 Dec 2023 10:52:32 +0100 Subject: [PATCH 2/2] update versions and model --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 49a958d..60f553b 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,6 @@ setup( 'pandas', 'pm4py==2.2.16', 'nltk', - 'numpy', 'scikit-learn', 'transformers', 'spacy', -- GitLab
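Note for reviewers (not part of either patch): a minimal sketch of how the new tagging path is expected to behave after these changes, based on the pipeline call and the merge_hashtags/parse_result helpers introduced in bert_tagger.py. It assumes the `arebmann/model` checkpoint referenced in extract.py can be loaded from the Hugging Face Hub; the sample input and the printed tags are illustrative, not actual model output.

```python
from transformers import pipeline

# The custom BertWrapper/BertForLabelParsing stack is removed; BertTagger now
# wraps a plain token-classification pipeline (model name taken from extract.py).
tagger = pipeline("ner", model="arebmann/model")

def merge_hashtags(entries):
    # Re-attach WordPiece continuation tokens ("##...") to the preceding word,
    # mirroring merge_hashtags() in bert_tagger.py (with an extra guard for an
    # empty list, which the patched helper does not handle).
    merged = []
    for entry in entries:
        if entry["word"].startswith("##") and merged:
            merged[-1]["word"] += entry["word"][2:]
        else:
            merged.append(dict(entry))
    return merged

raw = tagger("create purchase order")  # list of dicts with 'word' and 'entity' keys
tagged = [(e["word"], e["entity"]) for e in merge_hashtags(raw)]
print(tagged)  # e.g. [('create', 'A'), ('purchase', 'BO'), ('order', 'BO')] -- labels depend on the model
```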