From 80f43e69fc9f8b0497b320577bb0f3245abf1e73 Mon Sep 17 00:00:00 2001
From: Adrian Rebmann <rebmann@informatik.uni-mannheim.de>
Date: Thu, 14 Dec 2023 10:43:53 +0100
Subject: [PATCH 1/2] update versions and model

---
 README.md                                     |  28 +-
 .../actionclassification.py                   |  92 +-----
 .../attribute_classification.py               |  18 +-
 .../resourceclassifier.py                     |  64 +----
 .../subclassifiers/att_label_classifier.py    |  76 +----
 extraction/data/gathering/data_generator.py   |  97 ++++++-
 extraction/download.py                        |  10 -
 extraction/extract.py                         |   4 +-
 .../instancelabeling/bert_tagger/__init__.py  |  13 -
 .../bert_tagger/bert_for_label_parsing.py     |  31 --
 .../bert_tagger/bert_preprocessor.py          |  88 ------
 .../bert_tagger/bert_tagger.py                |  43 ++-
 .../bert_tagger/bert_wrapper.py               | 272 ------------------
 setup.py                                      |  22 +-
 14 files changed, 164 insertions(+), 694 deletions(-)
 delete mode 100644 extraction/instancelabeling/bert_tagger/bert_for_label_parsing.py
 delete mode 100644 extraction/instancelabeling/bert_tagger/bert_preprocessor.py
 delete mode 100644 extraction/instancelabeling/bert_tagger/bert_wrapper.py

diff --git a/README.md b/README.md
index 8797ebc..34d1c23 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-# Semantic Role Extraction from Event Data
-Python package for extracting process related semantic roles from event data. <br>
+# Semantic Component Extraction from Event Data
+Python package for extracting process-related semantic components from event data. <br>
 <sub>
 written by <a href="mailto:rebmann@informatik.uni-mannheim.de">Adrian Rebmann</a><br />
 </sub>
@@ -7,19 +7,19 @@ written by <a href="mailto:rebmann@informatik.uni-mannheim.de">Adrian Rebmann</a
 
 ## About
 The approach is described in:<br>
-*Rebmann, A., & van der Aa, H. (2021). Extracting Semantic Process Information from the Natural Language in Event Logs. <br>Advanced Information Systems Engineering. CAiSE 2021*
+*Rebmann, A., & van der Aa, H. (2022). Enabling Semantics-aware Process Mining through the Automatic Annotation of Event Logs. <br>Information Systems*
 ### Roles that are extracted, if available.
-| Role | Description | Example|
-| ------ | ------ | ------ |
-|'object:name'|The main object type(s) relevant to an event|purchase order|
-|'object:instance'|The main object instances(s) relevant to an event|purchase order 123|
-|'object:status'|An object’s status|open, closed|
-|'action:name'|The kind of action|create, send, receive|
-|'action:status'|An action’s status|started, paused|
-|'org:actor:name'|The type of active resource in the event|employee, system|
-|'org:actor:instance'|Information indicating the specific actor instance|employee 123|
-|'org:passive:name'|The type of passive resource related to the event|A reciepient of a document|
-|'org:passive:instance'|Information indicating specific passive resources|A specific reciepient of a document|
+| Component type         | Description | Example|
+|------------------------| ------ | ------ |
+| 'object:name'          |The main object type(s) relevant to an event|purchase order|
+| 'object:instance'      |The main object instance(s) relevant to an event|purchase order 123|
+| 'object:status'        |An object’s status|open, closed|
+| 'action:name'          |The kind of action|create, send, receive|
+| 'action:status'        |An action’s status|started, paused|
+| 'org:actor:name'       |The type of active resource in the event|employee, system|
+| 'org:actor:instance'   |Information indicating the specific actor instance|employee 123|
+| 'org:passive:name'     |The type of passive resource related to the event|A recipient of a document|
+| 'org:passive:instance' |Information indicating specific passive resources|A specific recipient of a document|
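For illustration, a single event covering all of these component types could be annotated along the following lines (a made-up sketch reusing the example values from the table above, not actual tool output):

```python
# Hypothetical annotation of one event with the component types listed above;
# all values are illustrative examples, not output of the package.
event_components = {
    "object:name": "purchase order",
    "object:instance": "purchase order 123",
    "object:status": "open",
    "action:name": "create",
    "action:status": "started",
    "org:actor:name": "employee",
    "org:actor:instance": "employee 123",
    "org:passive:name": "recipient",
    "org:passive:instance": "recipient of order 123",
}
```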
 
 # Installation
 1. Install via pip: <code>pip install git+https://gitlab.uni-mannheim.de/processanalytics/semantic-role-extraction.git</code>
diff --git a/extraction/attributeclassification/actionclassification.py b/extraction/attributeclassification/actionclassification.py
index c6b8b69..a07dbde 100644
--- a/extraction/attributeclassification/actionclassification.py
+++ b/extraction/attributeclassification/actionclassification.py
@@ -2,19 +2,11 @@ import json
 
 from nltk import WordNetLemmatizer
 
-from numpy import array
-from numpy import zeros
-from keras.preprocessing.text import Tokenizer
-from keras.preprocessing.sequence import pad_sequences
-from keras.models import Sequential
-from keras.layers import Dense
-from keras.layers import Flatten
-from keras.layers import Embedding
-
-from extraction.const import ConceptType, ACTION_IDX_TO_LABEL
+from extraction.const import ConceptType
 import operator
 from nltk.corpus import words
 
+
 class ActionClassifier:
 
     def __init__(self, config, aug_log, embeddings):
@@ -38,8 +30,6 @@ class ActionClassifier:
         self.upper_acts = upper_acts
 
     def classify_actions(self):
-        #self.produce_gs()
-        #self.build_classifier()
         return {act: self.get_action_type_for_action(act) for act in self.actions}
 
     def unique_actions_from_taxonomy(self, action_taxonomy, unique_actions, child_to_upper_level, upper_acts, upper_level=None):
@@ -88,46 +78,26 @@ class ActionClassifier:
             return "None"
         sims = {}
         upper_level_sims = {}
-        #combined_sims = {}
         for tax_action in taxonomy_actions:
             try:
                 sim = self.embeddings.embeddings.similarity(action, tax_action)
-                #print(action, tax_action, sim)
                 if tax_action in upper_acts:
                     upper_level_sims[tax_action] = sim
                 sims[tax_action] = sim
             except KeyError as e:
-                #print(e)
                 action = self.lemmatizer.lemmatize((action.split(" ")[-1]))
                 try:
                     sim = self.embeddings.embeddings.similarity(action, tax_action)
-                    #print(action, tax_action, sim)
                     if tax_action in upper_acts:
                         upper_level_sims[tax_action] = sim
                     sims[tax_action] = sim
                 except KeyError as e:
                     pass
-                    #print(e, "after lemmatization still")
         if len(sims) == 0:
             return "None"
-
-        # for u_act, u_sim in upper_level_sims.items():
-        #     for act, sim in sims.items():
-        #         if u_act in self.child_to_upper_level[act]:
-        #             combined_sims[(u_act, act)] = u_sim + sim
-
         max_sim = max(sims.items(), key=operator.itemgetter(1))[0]
         max_sim_upper = max(upper_level_sims.items(), key=operator.itemgetter(1))[0]
         max_sim_upper_ini = str(max_sim_upper)
-        #max_sim_combined = max(combined_sims.items(), key=operator.itemgetter(1))[0][0]
-
-        #print("MAX any",  action, max_sim, sims[max_sim])
-        #print("MAX upper",  action, max_sim_upper, sims[max_sim_upper])
-        #print("MAX combi", action, max_sim_combined, sims[max_sim_combined])
-
-        #if sims[max_sim] <= upper_level_sims[max_sim_upper_ini]+0.05:
-        #    max_sim = max_sim_upper_ini
-
         if len(child_to_upper_level[max_sim]) == 1:
             max_sim = list(child_to_upper_level[max_sim])[0]
         else:
@@ -136,67 +106,9 @@ class ActionClassifier:
                 if upper_level_sims[upper_level_act] > max_sim_upper:
                     max_sim = upper_level_act
                     max_sim_upper = upper_level_sims[upper_level_act]
-
-        #print("MAX top-level", action, max_sim, sims[max_sim])
-
-        #if sims[max_sim_any] < .5:
-        #    return "None"
         return max_sim if sims[max_sim] > 0 else max_sim_upper_ini
 
     def get_action_type_for_action(self, action):
         return self.get_most_similar(action, self.unique_actions_taxonomy,self.child_to_upper_level, self.upper_acts)
 
-    def build_classifier(self):
-        with open(self.config.resource_dir + 'mitphb.json') as json_file:
-            action_taxonomy = json.load(json_file)
-        # all unique actions
-        unique_actions_from_taxonomy = set()
-        # a mapping from all unique actions to their top most ancestor(s)
-        child_to_upper_level = dict()
-        # all upper level actions
-        upper_acts = set()
-        self.unique_actions_from_taxonomy(action_taxonomy, unique_actions_from_taxonomy, child_to_upper_level,
-                                          upper_acts)
-
-        # define documents
-        docs = [act for act in unique_actions_from_taxonomy]
-        # define class labels
-        label_2_idx = {lab: idx for idx, lab in ACTION_IDX_TO_LABEL.items()}
-        labels = array([label_2_idx[child_to_upper_level[doc].pop()] for doc in docs])
-        print(docs)
-        print(labels)
-        # prepare tokenizer
-        t = Tokenizer()
-        t.fit_on_texts(docs)
-        vocab_size = len(t.word_index) + 1
-        # integer encode the documents
-        encoded_docs = t.texts_to_sequences(docs)
-        print(encoded_docs)
-        # pad documents to a max length of 4 words
-        max_length = 2
-        padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
-        print(padded_docs)
-        # create a weight matrix for words in training docs
-        embedding_matrix = zeros((vocab_size, 50))
-        for word, i in t.word_index.items():
-            if word in self.embeddings.embeddings:
-                embedding_vector = self.embeddings.embeddings[word]
-                embedding_matrix[i] = embedding_vector
-        # define model
-        model = Sequential()
-        e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=2, trainable=False)
-        model.add(e)
-        model.add(Flatten())
-        model.add(Dense(1, activation='softmax'))
-        # compile the model
-        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
-        # summarize the model
-        print(model.summary())
-        # fit the model
-        model.fit(padded_docs, labels, epochs=50, verbose=0)
-        # evaluate the model
-        loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
-        print('Accuracy: %f' % (accuracy * 100))
-
-
 
diff --git a/extraction/attributeclassification/attribute_classification.py b/extraction/attributeclassification/attribute_classification.py
index 92b7633..810e7e2 100644
--- a/extraction/attributeclassification/attribute_classification.py
+++ b/extraction/attributeclassification/attribute_classification.py
@@ -47,7 +47,7 @@ class AttributeClassifier:
                 idx.append((sen_idx[1] + k))
                 new_sen.insert((sen_idx[1] + k), (unique_val.split()[k], entity_type))
                 #print(new_sen)
-            return classifier.predict_single_label_full(get_plain_sentence(new_sen)), idx
+            return classifier.predict_single_label(get_plain_sentence(new_sen)), idx
         return None
 
     def find_state_pattern_in_label_using_bert(self, ld: AugmentedLog, classifier):
@@ -107,10 +107,7 @@ class AttributeClassifier:
             if curr is not None:
                 idx = curr[1]
                 split_res = curr[0]
-                res = split_res[1]
-                tags = res[0][0]
-                prob = res[1][0]
-                #print(entity_type, prob)
+                tags = split_res[1]
                 pred = []
                 act = []
                 num_matches = 0
@@ -130,14 +127,7 @@ class AttributeClassifier:
         - ignores values that are only numeric, a date or contain only one character
         """
         the_dict = {}
-        # self.dummy_sent(classifier)
-        # all_sents = sen_gen.get_expressive_sentences()
-        # print("*"*40)
-        # print(len(all_sents))
-        # print("*" * 40)
-        # sys.exit(0)
         sens = sen_gen.get_defined_sentences()
-        #print(sens)
         winners = {}
         for att in ld.get_attributes_by_att_types(consider_for_value_classification):
             winners[att] = []
@@ -161,9 +151,7 @@ class AttributeClassifier:
                             the_dict[unique_val][entity_type] = curr[0]
                             idx = curr[1]
                             split_res = curr[0]
-                            res = split_res[1]
-                            tags = res[0][0]
-                            prob = res[1][0]
+                            tags = split_res[1]
                             pred = []
                             act = []
                             num_matches = 0
diff --git a/extraction/attributeclassification/resourceclassifier.py b/extraction/attributeclassification/resourceclassifier.py
index 06d1093..dabccab 100644
--- a/extraction/attributeclassification/resourceclassifier.py
+++ b/extraction/attributeclassification/resourceclassifier.py
@@ -1,26 +1,16 @@
-import json
 from collections import Counter
 from datetime import timedelta
 
 from nltk.corpus.reader import WordNetError
-from numpy import asarray, array
-from numpy import zeros
 from simpletransformers.classification import ClassificationModel
-from tensorflow.keras.preprocessing.text import Tokenizer
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense
-from tensorflow.keras.layers import Flatten
-from tensorflow.keras.layers import Embedding
 import pandas as pd
 import numpy as np
 
 from nltk.corpus import wordnet as wn
 
-from extraction.const import ConceptType, RESOURCE_IDX_TO_LABEL, TERMS_FOR_MISSING, AttributeType, type_mapping
+from extraction.const import ConceptType, RESOURCE_IDX_TO_LABEL, TERMS_FOR_MISSING, type_mapping
 from extraction.model.augmented_log import AugmentedLog
-from extraction.preprocessing.preprocessor import preprocess_label, clean_attribute_name, check_for_uuid
-from extraction.data.gathering.schemaorgextraction import read_and_extract
+from extraction.preprocessing.preprocessor import preprocess_label, check_for_uuid
 
 from nltk import WordNetLemmatizer
 from nltk.corpus import words
@@ -303,56 +293,6 @@ class ResourceClassifier:
                     clear_preds_res_bert[res] = value
         return clear_preds_res, clear_preds_res_text, clear_preds_res_misc, clear_preds_res_ne, clear_preds_res_wn, clear_preds_res_bert, clear_preds_res_other
 
-    def train_and_classify(self):
-        actor_terms, _, _, _, _ = read_and_extract(self.config.resource_dir)
-        docs = []
-        labels = []
-        for doc in actor_terms:
-            docs.append(preprocess_label(doc))
-            labels.append(0)
-
-        labels = array(labels)
-        t = Tokenizer()
-        t.fit_on_texts(docs)
-        vocab_size = len(t.word_index) + 1
-        # integer encode the documents
-        encoded_docs = t.texts_to_sequences(docs)
-        print(encoded_docs)
-        # pad documents to a max length of 4 words
-        max_length = 4
-        padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
-        print(padded_docs)
-        # load the whole embedding into memory
-        embeddings_index = dict()
-        f = open(self.config.resource_dir + '/glove.6B.100d.txt')
-        for line in f:
-            values = line.split()
-            word = values[0]
-            coefs = asarray(values[1:], dtype='float32')
-            embeddings_index[word] = coefs
-        f.close()
-        print('Loaded %s word vectors.' % len(embeddings_index))
-        # create a weight matrix for words in training docs
-        embedding_matrix = zeros((vocab_size, 100))
-        for word, i in t.word_index.items():
-            embedding_vector = embeddings_index.get(word)
-            if embedding_vector is not None:
-                embedding_matrix[i] = embedding_vector
-        # define .model
-        model = Sequential()
-        e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False)
-        model.add(e)
-        model.add(Flatten())
-        model.add(Dense(1, activation='sigmoid'))
-        # compile the .model
-        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
-        # summarize the .model
-        print(model.summary())
-        # fit the .model
-        model.fit(padded_docs, labels, epochs=50, verbose=0)
-        # evaluate the .model
-        loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
-        print('Accuracy: %f' % (accuracy * 100))
 
     def check_wordnet(self, original, word, res, cnt=0):
         # print(word)
diff --git a/extraction/attributeclassification/subclassifiers/att_label_classifier.py b/extraction/attributeclassification/subclassifiers/att_label_classifier.py
index 6afc110..be1844c 100644
--- a/extraction/attributeclassification/subclassifiers/att_label_classifier.py
+++ b/extraction/attributeclassification/subclassifiers/att_label_classifier.py
@@ -12,14 +12,6 @@ import numpy as np
 from extraction.readwrite.loader import deserialize_model
 from extraction.readwrite.writer import serialize_model
 from nltk.corpus import words
-from numpy import array
-from numpy import zeros
-from keras.preprocessing.text import Tokenizer
-from keras.preprocessing.sequence import pad_sequences
-from keras.models import Sequential
-from keras.layers import Dense
-from keras.layers import Flatten
-from keras.layers import Embedding
 
 CAiSE_VERSION = False
 
@@ -58,15 +50,15 @@ class AttributeLabelClassifier:
     def with_tf_idf_and_embedding(self, eval_mode=False):
         # self.build_classifier()
         res = {}
-        md = deserialize_model(self.path, "att_class")
-
+        #md = deserialize_model(self.path, "att_class")
+        md = False
         if md is False or eval_mode is True:
             print("Build new attribute classifier")
             tfidf = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2))
             tfidf.fit_transform(self.d["text"].values)
             # test = self.test["text"].values
             # Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
-            idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
+            idf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))
             x_train = tfidf_glove(idf_dict, self.d["text"].values, self.embeddings)
             # x_test = tfidf_glove(idf_dict, test, glove)
             enc = LabelEncoder()
@@ -98,10 +90,13 @@ class AttributeLabelClassifier:
             #     res[plain] = "BO", 1
             elif any(self.check_proper_word(tok) for tok in clean.split(" ")):
                 x_t = tfidf_glove(idf_dict, [clean], self.embeddings)
-                probas = clzz.predict_proba(x_t)[0]
-                pred = enc.inverse_transform(clzz.predict(x_t))[0]
+                try:
+                    probas = clzz.predict_proba(x_t)[0]
+                    pred = enc.inverse_transform(clzz.predict(x_t))[0]
                 #print(clean, plain, pred, probas[X.index(pred)])
-                res[plain] = pred, probas[X.index(pred)]
+                    res[plain] = pred, probas[X.index(pred)]
+                except AttributeError:
+                    res[plain] = "X", 1
             else:
                 #print(plain, "X", 1)
                 res[plain] = "X", 1
@@ -115,63 +110,12 @@ class AttributeLabelClassifier:
     def pp_doc(self, doc):
         return doc.replace("type", "").replace("uuid", "").replace("identity", "").replace("id", "")
 
-    def build_classifier(self):
-        res = {}
-        docs = [act for act in self.d["text"].values]
-        # define class labels
-        label_2_idx = {lab: idx for idx, lab in enumerate(self.d["y"].unique())}  # TODO
-        print(label_2_idx)
-        idx_2_label = {idx: lab for lab, idx in label_2_idx.items()}  # TODO
-        labels = array([label_2_idx[lab] for lab in self.d["y"].values])  # TODO
-        print(docs)
-        print(labels)
-        # prepare tokenizer
-        t = Tokenizer()
-        t.fit_on_texts(docs)
-        vocab_size = len(t.word_index) + 1
-        # integer encode the documents
-        encoded_docs = t.texts_to_sequences(docs)
-        print(encoded_docs)
-        # pad documents to a max length of 4 words
-        max_length = 2
-        padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
-        print(padded_docs)
-        # create a weight matrix for words in training docs
-        embedding_matrix = zeros((vocab_size, 50))
-        for word, i in t.word_index.items():
-            if word in self.embeddings:
-                embedding_vector = self.embeddings[word]
-                embedding_matrix[i] = embedding_vector
-        # define model
-        model = Sequential()
-        e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=2, trainable=False)
-        model.add(e)
-        model.add(Flatten())
-        model.add(Dense(1, activation='softmax'))
-        # compile the model
-        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
-        # summarize the model
-        print(model.summary())
-        # fit the model
-        model.fit(padded_docs, labels, epochs=50, verbose=0)
-        # evaluate the model
-        loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
-        print('Accuracy: %f' % (accuracy * 100))
-
-        for plain in self.cols:
-            clean = self.prepare_name(plain)
-            if any(self.check_proper_word(tok) for tok in clean.split(" ")):
-                encoded_clean = t.texts_to_sequences([clean])
-                padded_clean = pad_sequences(encoded_clean, maxlen=max_length, padding='post')
-                print(clean)
-                print(model.predict(padded_clean))
-
     def prepare_name(self, plain):
         return clean_attribute_name(plain).replace("doc", "document").replace("type", "").replace("uuid", "").replace(
             "identity", "").replace("id", "")
 
 
 def build_log_reg(train_features, y_train, alpha=1e-4):
-    log_reg = SGDClassifier(loss='log', alpha=alpha, n_jobs=-1, penalty='l2')
+    log_reg = SGDClassifier(loss='hinge', alpha=alpha, n_jobs=-1, penalty='l2')
     log_reg.fit(train_features, y_train)
     return log_reg
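A brief sketch of what this change implies (assuming scikit-learn's standard behavior): with loss='hinge' the SGDClassifier acts as a linear SVM and no longer exposes predict_proba, which is presumably why the AttributeError fallback above was added.

```python
# Illustrative sketch: SGDClassifier with loss='hinge' (linear SVM) raises
# AttributeError on predict_proba, unlike the previous loss='log'.
import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array([0, 1, 1, 0])

clf = SGDClassifier(loss='hinge', alpha=1e-4, penalty='l2').fit(X, y)
print(clf.predict(X))               # hard class labels still work
try:
    clf.predict_proba(X)
except AttributeError as err:       # no probability estimates for hinge loss
    print("predict_proba unavailable:", err)
```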
diff --git a/extraction/data/gathering/data_generator.py b/extraction/data/gathering/data_generator.py
index d13cc67..2678589 100644
--- a/extraction/data/gathering/data_generator.py
+++ b/extraction/data/gathering/data_generator.py
@@ -1,10 +1,99 @@
 import random
 from extraction.const import ConceptType
-import extraction.instancelabeling.bert_tagger.bert_preprocessor as bp
 
 rec_pronouns = ["to"]
 act_pronouns = ["by"]
 
+def data_to_object(dataframe, cols=False, only_label=[]):
+    objects = []
+    for index, row in dataframe.iterrows():
+        if not cols:
+            objects.append(DataObject(row))
+        else:
+            for col in dataframe.columns:
+                if (len(only_label) == 0) and ('_tags' in col):
+                    objects.append(DataObject(row, col))
+                elif len(only_label) > 0:
+                    for consider in only_label:
+                        if consider in col:
+                            objects.append(DataObject(row, col))
+    return objects
+
+
+def parse_tags(item, col):
+    w, t = split_tags(item[col])
+    return w, t
+
+
+def split_tags(tags):
+    w = []
+    t = []
+    tag_sets = tags.split(',')
+    tag_sets = tag_sets[0:-1]
+    for tag_set in tag_sets:
+        word = tag_set.split('<>')[0].strip(' ')
+        tag = tag_set.split('<>')[1].strip(' ')
+        w.append(word)
+        t.append(tag)
+    return w, t
+
+
+def to_list_format(train_objects):
+    label_to_list_tuple = {}
+    for item in train_objects:
+        label_to_list_tuple[item.label] = (item.split.copy(), item.tags.copy())
+    return label_to_list_tuple
+
+
+def to_tuple_format(train_objects):
+    train_sentences_semantic = {}
+    for item in train_objects:
+        curr = []
+        for i in range(len(item.split)):
+            curr.append((item.split[i], item.tags[i]))
+        train_sentences_semantic[' '.join(item.split)] = curr
+    return train_sentences_semantic
+
+
+class DataObject:
+    def __init__(self, item, col='Tags'):
+        self.split, self.tags = parse_tags(item, col)
+        lab = ''
+        for part in self.split:
+            lab += part + ' '
+        self.label = lab.strip()
+
+
+def prepare_data(train_objects):
+    train_sentences = []
+    for item in train_objects:
+        curr = []
+        for i in range(len(item.split)):
+            curr.append((item.split[i], item.tags[i]))
+        train_sentences.append(curr)
+
+
+def prepare_tag_set(train_sentences):
+    tags = set([item for sublist in train_sentences for _, item in sublist])
+    tag2idx = {}
+    idx2tag = {}
+    for i, tag in enumerate(sorted(tags)):
+        tag2idx[tag] = i + 1
+        idx2tag[i + 1] = tag
+
+    # Special character for the tags
+    tag2idx['[PAD]'] = 0
+    idx2tag[0] = '[PAD]'
+
+
+def _tag_sequence(sentences):
+    return [[t for w, t in sentence] for sentence in sentences]
+
+
+def _text_sequence(sentences):
+    return [[w for w, t in sentence] for sentence in sentences]
+
+
 
 def get_plain_sentence(sen):
     res = ''
@@ -16,9 +105,9 @@ def get_plain_sentence(sen):
 class DataGenerator:
 
     def _fill_sets(self, tagged_instances):
-        tagged_obj = bp.data_to_object(tagged_instances)
-        self.label_to_tagging = bp.to_list_format(tagged_obj)
-        self.originals = bp.to_tuple_format(tagged_obj)
+        tagged_obj = data_to_object(tagged_instances)
+        self.label_to_tagging = to_list_format(tagged_obj)
+        self.originals = to_tuple_format(tagged_obj)
         bos = set()
         actions = set()
         actors = set()
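As a small clarification of the tag-string format these relocated helpers consume (the words and tag names below are made up; only the 'word<>TAG,' layout with a trailing comma follows from split_tags above):

```python
# Illustrative call of split_tags as defined in data_generator.py; the tag
# names 'A' and 'BO' are hypothetical placeholders.
from extraction.data.gathering.data_generator import split_tags

words, tags = split_tags("create<>A, purchase<>BO, order<>BO,")
print(words)  # ['create', 'purchase', 'order']
print(tags)   # ['A', 'BO', 'BO']
```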
diff --git a/extraction/download.py b/extraction/download.py
index c0e1be3..5f3c750 100644
--- a/extraction/download.py
+++ b/extraction/download.py
@@ -24,16 +24,6 @@ def download(model):
     )
 
 
-def require_package(name):
-    try:
-        import pkg_resources
-
-        pkg_resources.working_set.require(name)
-        return True
-    except:  # noqa: E722
-        return False
-
-
 def download_model(name):
     spacy.cli.download(_spacy_model)
     download_url = _model_repo
diff --git a/extraction/extract.py b/extraction/extract.py
index 78d5cde..5701770 100644
--- a/extraction/extract.py
+++ b/extraction/extract.py
@@ -27,6 +27,7 @@ _model_folder = "model"
 
 _glove_model = "glove.6B.100d.txt"
 _spacy_model = "en_core_web_lg"
+_bert_model = "arebmann/model"
 
 
 def get_instance():
@@ -50,7 +51,7 @@ def _load_models():
     nlp_util = NLPUtils(_spacy_model)
 
     def _load_bert():
-        bt = BertTagger(data_path, nlp_util)
+        bt = BertTagger(_bert_model, nlp_util)
         return bt
 
     bert = _load_bert()
@@ -142,6 +143,7 @@ class Extraction:
         word_embeddings = WordEmbeddings(config=config)
         print("BERT-based semantic tagging")
         print('semantic tagging text attributes')
+        tic = time.perf_counter()
         self._bert.get_tags_for_df(aug_log)
         toc = time.perf_counter()
         print(f"Tagged the whole data set in {tic - toc:0.4f} seconds")
diff --git a/extraction/instancelabeling/bert_tagger/__init__.py b/extraction/instancelabeling/bert_tagger/__init__.py
index 484ead0..e69de29 100644
--- a/extraction/instancelabeling/bert_tagger/__init__.py
+++ b/extraction/instancelabeling/bert_tagger/__init__.py
@@ -1,13 +0,0 @@
-from .bert_wrapper import BertWrapper
-from .bert_for_label_parsing import BertForLabelParsing
-
-'''
-the code in this module is for the most part adapted from the implementation in the context of the publication
-
-@inproceedings{shelmanov2019bibm,
-    title={Active Learning with Deep Pre-trained Models for Sequence Tagging of Clinical and Biomedical Texts},
-    author={Artem Shelmanov and Vadim Liventsev and Danil Kireev and Nikita Khromov and Alexander Panchenko and Irina Fedulova and Dmitry V. Dylov},
-    booktitle={Proceedings of International Conference on Bioinformatics & Biomedicine (BIBM)},
-    year={2019}
-}
-'''
\ No newline at end of file
diff --git a/extraction/instancelabeling/bert_tagger/bert_for_label_parsing.py b/extraction/instancelabeling/bert_tagger/bert_for_label_parsing.py
deleted file mode 100644
index e8a4bd6..0000000
--- a/extraction/instancelabeling/bert_tagger/bert_for_label_parsing.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from transformers import BertForTokenClassification
-
-from torch.nn import CrossEntropyLoss
-
-
-class BertForLabelParsing(BertForTokenClassification):
-    def __init__(self, config):
-        super().__init__(config)
-
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None, loss_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
-        sequence_output = outputs[0]
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = (attention_mask.view(-1) == 1) 
-                if loss_mask is not None:
-                    active_loss &= loss_mask.view(-1)
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-        return outputs  # (loss), scores, (hidden_states), (attentions)
diff --git a/extraction/instancelabeling/bert_tagger/bert_preprocessor.py b/extraction/instancelabeling/bert_tagger/bert_preprocessor.py
deleted file mode 100644
index 77ad24b..0000000
--- a/extraction/instancelabeling/bert_tagger/bert_preprocessor.py
+++ /dev/null
@@ -1,88 +0,0 @@
-def data_to_object(dataframe, cols=False, only_label=[]):
-    objects = []
-    for index, row in dataframe.iterrows():
-        if not cols:
-            objects.append(DataObject(row))
-        else:
-            for col in dataframe.columns:
-                if (len(only_label) == 0) and ('_tags' in col):
-                    objects.append(DataObject(row, col))
-                elif len(only_label) > 0:
-                    for consider in only_label:
-                        if consider in col:
-                            objects.append(DataObject(row, col))
-    return objects
-
-
-def parse_tags(item, col):
-    w, t = split_tags(item[col])
-    return w, t
-
-
-def split_tags(tags):
-    w = []
-    t = []
-    tag_sets = tags.split(',')
-    tag_sets = tag_sets[0:-1]
-    for tag_set in tag_sets:
-        word = tag_set.split('<>')[0].strip(' ')
-        tag = tag_set.split('<>')[1].strip(' ')
-        w.append(word)
-        t.append(tag)
-    return w, t
-
-
-def to_list_format(train_objects):
-    label_to_list_tuple = {}
-    for item in train_objects:
-        label_to_list_tuple[item.label] = (item.split.copy(), item.tags.copy())
-    return label_to_list_tuple
-
-
-def to_tuple_format(train_objects):
-    train_sentences_semantic = {}
-    for item in train_objects:
-        curr = []
-        for i in range(len(item.split)):
-            curr.append((item.split[i], item.tags[i]))
-        train_sentences_semantic[' '.join(item.split)] = curr
-    return train_sentences_semantic
-
-
-class DataObject:
-    def __init__(self, item, col='Tags'):
-        self.split, self.tags = parse_tags(item, col)
-        lab = ''
-        for part in self.split:
-            lab += part + ' '
-        self.label = lab.strip()
-
-
-def prepare_data(train_objects):
-    train_sentences = []
-    for item in train_objects:
-        curr = []
-        for i in range(len(item.split)):
-            curr.append((item.split[i], item.tags[i]))
-        train_sentences.append(curr)
-
-
-def prepare_tag_set(train_sentences):
-    tags = set([item for sublist in train_sentences for _, item in sublist])
-    tag2idx = {}
-    idx2tag = {}
-    for i, tag in enumerate(sorted(tags)):
-        tag2idx[tag] = i + 1
-        idx2tag[i + 1] = tag
-
-    # Special character for the tags
-    tag2idx['[PAD]'] = 0
-    idx2tag[0] = '[PAD]'
-
-
-def _tag_sequence(sentences):
-    return [[t for w, t in sentence] for sentence in sentences]
-
-
-def _text_sequence(sentences):
-    return [[w for w, t in sentence] for sentence in sentences]
diff --git a/extraction/instancelabeling/bert_tagger/bert_tagger.py b/extraction/instancelabeling/bert_tagger/bert_tagger.py
index 0c38767..1f2ae1a 100644
--- a/extraction/instancelabeling/bert_tagger/bert_tagger.py
+++ b/extraction/instancelabeling/bert_tagger/bert_tagger.py
@@ -1,10 +1,9 @@
 import pandas as pd
 import random
+from transformers import pipeline
 from extraction.model.augmented_log import AugmentedLog
 from extraction.const import consider_for_tagging, ConceptType, AttributeType, TERMS_FOR_MISSING
 
-from extraction.instancelabeling.bert_tagger import BertWrapper, BertForLabelParsing
-
 
 def get_train_val_test_split(sentences):
     random.shuffle(sentences)
@@ -19,6 +18,24 @@ def get_train_val_test_split(sentences):
     return train_data, val_data, test_data
 
 
+def merge_hashtags(words_list):
+    merged_list = []
+    for entry in words_list:
+        word = entry['word']
+        if word.startswith('##'):
+            merged_list[-1]['word'] += word[2:]
+        else:
+            merged_list.append(entry)
+    return merged_list
+
+
+def parse_result(result):
+    res = [], []
+    for entry in result:
+        res[0].append(entry['word'])
+        res[1].append(entry['entity'])
+    return res
+
 
 class BertTagger:
 
@@ -41,7 +58,7 @@ class BertTagger:
         return new_pred
 
     def _load_trained_model(self, path):
-        self.model = BertWrapper.load_serialized(path, BertForLabelParsing)
+        self.model = pipeline("ner", model=path)
 
     def tag_log(self, ld: AugmentedLog):
         return self._add_tags(ld)
@@ -69,16 +86,17 @@ class BertTagger:
         seen_tagged = ld.tagged_labels
         tagged_per_attribute = {}
         cols_to_be_tagged = ld.get_attributes_by_att_types(consider_for_tagging)
+        print("cols to be tagged", cols_to_be_tagged)
         for v in cols_to_be_tagged:
-
             unique_labels = [val for val in ld.att_to_unique[v] if val not in TERMS_FOR_MISSING]
             prefixes = set(lab.split(" ")[0] for lab in unique_labels)
             if len(prefixes) != 1:
                 tagged_per_attribute[v] = {}
                 predicted = self.predict_batch_at_once(unique_labels)
+                print(predicted)
                 for unique, pred in zip(unique_labels, predicted):
                     if unique not in seen_tagged:
-                        pred = self.check_tok_for_object_type(unique.split(), pred)
+                        pred = self.check_tok_for_object_type(unique.split(), pred[1])
                         tagged_per_attribute[v][unique] = unique.split(), pred
                         seen_tagged[unique] = unique.split(), pred
             else:
@@ -93,6 +111,7 @@ class BertTagger:
         print('semantic tagging: ' + col_name)
         unique_labels = ld.att_to_unique[col_name]
         predicted = self.predict_batch_at_once(unique_labels)
+        print(predicted)
         for unique, pred in zip(unique_labels, predicted):
             if unique not in seen_tagged:
                 pred = self.check_tok_for_object_type(unique.split(), pred)
@@ -108,15 +127,15 @@ class BertTagger:
         return tagged
 
     def predict_single_label(self, label):
-        split, pred = label.split(), self.model.predict([label.split()])[0][0]
+        merged = merge_hashtags(self.model(label))
+        split, pred = parse_result(merged)
         pred = self.check_tok_for_object_type(split, pred)
         return split, pred
 
     def predict_batch_at_once(self, labels):
-        return self.model.predict([label.split() for label in labels])[0]
-
-    def predict_single_label_full(self, label):
-        return label.split(), self.model.predict([label.split()])
+        res = self.model(labels)
+        merged = [merge_hashtags(x) for x in res]
+        return [parse_result(x) for x in merged]
 
     @staticmethod
     def _fill_all(x, seen_tagged):
@@ -129,7 +148,3 @@ class BertTagger:
         uniquely_tagged.append(tagging)
         return uniquely_tagged
 
-    def serialize_model(self):
-        self.model.save_serialize('./.model/')
-
-
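To illustrate the new prediction path: _load_trained_model now builds a Hugging Face token-classification pipeline (pipeline("ner", model=path), with "arebmann/model" passed in from extract.py), and merge_hashtags/parse_result turn its per-token output back into word and tag lists. The sketch below assumes the pipeline's usual output dictionaries; the words and entity labels are invented:

```python
# Sketch of post-processing a transformers NER pipeline result; '##' marks
# sub-word pieces in the pipeline's 'word' field. Words and entity labels
# here are invented for illustration.
from extraction.instancelabeling.bert_tagger.bert_tagger import merge_hashtags, parse_result

raw = [
    {"word": "cre", "entity": "A"},
    {"word": "##ate", "entity": "A"},
    {"word": "invoice", "entity": "BO"},
]
merged = merge_hashtags(raw)        # sub-word pieces folded into 'create'
words, tags = parse_result(merged)
print(words)  # ['create', 'invoice']
print(tags)   # ['A', 'BO']
```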
diff --git a/extraction/instancelabeling/bert_tagger/bert_wrapper.py b/extraction/instancelabeling/bert_tagger/bert_wrapper.py
deleted file mode 100644
index bcde46c..0000000
--- a/extraction/instancelabeling/bert_tagger/bert_wrapper.py
+++ /dev/null
@@ -1,272 +0,0 @@
-import torch
-import numpy as np
-import pickle
-import json
-import os
-from torch.utils.data import DataLoader
-
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-
-from .bert_for_label_parsing import BertForLabelParsing
-
-
-def _add_x_labels(labels, bpe_masks):
-    result_labels = []
-    for l_sent, m_sent in zip(labels, bpe_masks):
-        m_sent = m_sent[1:-1]
-        sent_res = []
-        i = 0
-        for l in l_sent:
-            sent_res.append(l)
-
-            i += 1
-            while i < len(m_sent) and (m_sent[i] == 0):
-                i += 1
-                sent_res.append('[PAD]')
-
-        result_labels.append(sent_res)
-
-    return result_labels
-
-
-class BertWrapper:
-    def __init__(self, bert_model, bpe_tokenizer, idx2tag, tag2idx, 
-                 max_len=100, pred_loader_args={'num_workers' : 1}, 
-                 pred_batch_size=100):
-        super().__init__()
-        
-        self._bert_model = bert_model
-        self._bpe_tokenizer = bpe_tokenizer
-        self._idx2tag = idx2tag
-        self._tag2idx = tag2idx
-        self._max_len = max_len
-        self._pred_loader_args = pred_loader_args
-        self._pred_batch_size = pred_batch_size
-        
-    def _bpe_tokenize(self, words):
-        new_words = []
-        bpe_masks = []
-        for word in words:
-            bpe_tokens = self._bpe_tokenizer.tokenize(word)
-            new_words += bpe_tokens        
-            bpe_masks += [1] + [0] * (len(bpe_tokens) - 1)
-
-        return new_words, bpe_masks
-        
-    def _make_tokens_tensors(self, tokens, max_len):
-        bpe_tokens, bpe_masks = tuple(zip(*[self._bpe_tokenize(sent) for sent in tokens]))
-        bpe_tokens = prepare_bpe_tokens_for_bert(bpe_tokens, max_len=max_len)
-        bpe_masks = [[1] + masks[:max_len-2] + [1] for masks in bpe_masks]
-        max_len = max(len(sent) for sent in bpe_tokens)
-        token_ids = torch.tensor(create_tensors_for_tokens(self._bpe_tokenizer, bpe_tokens, max_len=max_len))
-        token_masks = generate_masks(token_ids)
-        return bpe_tokens, max_len, token_ids, token_masks, bpe_masks
-
-    def _make_label_tensors(self, labels, bpe_masks, max_len):
-        bpe_labels = _add_x_labels(labels, bpe_masks)
-        bpe_labels = prepare_bpe_labels_for_bert(bpe_labels, max_len=max_len)
-        label_ids = torch.tensor(create_tensors_for_labels(self._tag2idx, bpe_labels, max_len=max_len))
-        loss_masks = label_ids != self._tag2idx['[PAD]']
-        return label_ids, loss_masks
-    
-    def _logits_to_preds(self, logits, bpe_masks, tokens):
-        #print(self._idx2tag)
-        preds = logits.argmax(dim=2).numpy()
-        probs = logits.numpy().max(axis=2)
-        prob = [np.mean([p for p, m in zip(prob[:len(masks)], masks[:len(prob)]) if m][1:-1])  
-                for prob, masks in zip(probs, bpe_masks)]
-        try:
-          #print('in 1')
-          preds = [[self._idx2tag[(int(p))] for p, m in zip(pred[:len(masks)], masks[:len(pred)]) if m][1:-1] 
-                  for pred, masks in zip(preds, bpe_masks)]
-        except KeyError:
-          #print('in 2')
-          preds = [[self._idx2tag[(str(p))] for p, m in zip(pred[:len(masks)], masks[:len(pred)]) if m][1:-1] 
-                  for pred, masks in zip(preds, bpe_masks)]
-        preds = [pred + ['O']*(max(0, len(toks) - len(pred))) for pred, toks in zip(preds, tokens)]
-        return preds, prob
-    
-    def generate_tensors_for_prediction(self, evaluate, dataset_row):
-        dataset_row = dataset_row
-        labels = None
-        if evaluate:
-            tokens, labels = tuple(zip(*dataset_row))
-        else:
-            tokens = dataset_row
-            
-        _, max_len, token_ids, token_masks, bpe_masks = self._make_tokens_tensors(tokens, self._max_len)
-        label_ids = None
-        loss_masks = None
-            
-        if evaluate:
-            label_ids, loss_masks = self._make_label_tensors(labels, bpe_masks, max_len)
-        
-        return token_ids, token_masks, bpe_masks, label_ids, loss_masks, tokens, labels
-    
-    def predict(self, dataset, evaluate=False, metrics=None):
-        if metrics is None:
-            metrics = []
-        
-        self._bert_model.eval()
-        
-        dataloader = DataLoader(dataset, 
-                                collate_fn=lambda dataset_row: self.generate_tensors_for_prediction(evaluate, dataset_row), 
-                               **self._pred_loader_args, 
-                                batch_size=self._pred_batch_size)
-        predictions = []
-        probas = []
-        if evaluate:
-            cum_loss = 0.
-            true_labels = []
-        for nb, tensors in enumerate(dataloader):
-            token_ids, token_masks, bpe_masks, label_ids, loss_masks, tokens, labels = tensors
-            if evaluate:
-                true_labels.extend(labels)
-            with torch.no_grad():
-                token_ids = token_ids#.cuda()
-                token_masks = token_masks#.cuda()
-                
-                if evaluate:
-                    label_ids = label_ids#.cuda()
-                    loss_masks = loss_masks#.cuda()
-    
-                if type(self._bert_model) is BertForLabelParsing:
-                    logits = self._bert_model(token_ids, 
-                                              token_type_ids=None,
-                                              attention_mask=token_masks,
-                                              labels=label_ids,
-                                              loss_mask=loss_masks)
-                else:
-                    logits = self._bert_model(token_ids, 
-                                              token_type_ids=None,
-                                              attention_mask=token_masks,
-                                              labels=label_ids,)
-                if evaluate:
-                    loss, logits = logits
-                    cum_loss += loss.mean().item()
-                else:
-                    logits = logits[0]
-                b_preds, b_prob = self._logits_to_preds(logits.cpu(), bpe_masks, tokens)
-            predictions.extend(b_preds)
-            probas.extend(b_prob)
-        if evaluate: 
-            cum_loss /= (nb + 1)
-            result_metrics = []
-            for metric in metrics:
-                result_metrics.append(metric(true_labels, predictions))
-            return predictions, probas, tuple([cum_loss] + result_metrics)
-        else:
-            return predictions, probas
-        
-    def generate_tensors_for_training(self, tokens, labels):
-        _, max_len, token_ids, token_masks, bpe_masks = self._make_tokens_tensors(tokens, self._max_len)
-        label_ids, loss_masks = self._make_label_tensors(labels, bpe_masks, max_len)
-        return token_ids, token_masks, label_ids, loss_masks
-    
-    def generate_feature_tensors_for_prediction(self, tokens):
-        _, max_len, token_ids, token_masks, bpe_masks = self._make_tokens_tensors(tokens, self._max_len)
-        return token_ids, token_masks, bpe_masks
-
-    def batch_loss_tensors(self, *tensors):
-        token_ids, token_masks, label_ids, loss_masks = tensors
-        token_ids = token_ids.cuda()
-        token_masks = token_masks.cuda()
-        label_ids = label_ids.cuda()
-        loss_masks = loss_masks.cuda()
-        
-        if type(self._bert_model) is BertForLabelParsing:
-            output = self._bert_model(token_ids, 
-                                    token_type_ids=None,
-                                    attention_mask=token_masks, 
-                                    labels=label_ids,
-                                    loss_mask=loss_masks)
-        else:
-            output = self._bert_model(token_ids, 
-                                    token_type_ids=None, 
-                                    attention_mask=token_masks, 
-                                    labels=label_ids)
-        
-        loss = output[0]
-        return loss.mean()
-        
-    def batch_loss(self, tokens, labels):
-        token_ids, token_masks, label_ids, loss_masks = self.generate_tensors_for_training(tokens, labels)
-        return self.batch_loss_tensors(token_ids, None, token_masks, label_ids, loss_masks)
-    
-    def batch_logits(self, tokens):
-        _, max_len, token_ids, token_masks, __ = self._make_tokens_tensors(tokens, self._max_len)
-        token_ids = token_ids#.cuda()
-        token_masks = token_masks#.cuda()
-        
-        logits = self._bert_model(token_ids, 
-                                token_type_ids=None,
-                                attention_mask=token_masks, 
-                                labels=None,
-                                loss_mask=None)[0]
-
-        return logits
-    
-    def save_serialize(self, save_dir_path):
-        if not os.path.exists(save_dir_path):
-            os.makedirs(save_dir_path)
-        
-        torch.save(self._bert_model.state_dict(), os.path.join(save_dir_path, 'pytorch_model.bin'))
-        with open(os.path.join(save_dir_path, 'bpe_tokenizer.pckl'), 'wb') as f:
-            pickle.dump(self._bpe_tokenizer, f)
-            
-        self._bert_model.config.save_pretrained(os.path.join(save_dir_path))
-        
-        parameters_dict = {
-            'idx2tag' : self._idx2tag,
-            'tag2idx' : self._tag2idx,
-            'max_len' : self._max_len,
-            'pred_loader_args' : self._pred_loader_args,
-            'pred_batch_size' : self._pred_batch_size
-        }
-        with open(os.path.join(save_dir_path, 'sec_parameters.json'), 'w') as f:
-            json.dump(parameters_dict, f)
-
-    @classmethod
-    def load_serialized(cls, load_dir_path, bert_model_type):
-        print("loading serialized .model")
-        with open(os.path.join(load_dir_path, 'sec_parameters.json'), 'r') as f:
-            parameters_dict = json.load(f)
-         
-        bert_model = bert_model_type.from_pretrained(load_dir_path)#.cuda()
-        
-        with open(os.path.join(load_dir_path, 'bpe_tokenizer.pckl'), 'rb') as f:
-            bpe_tokenizer = pickle.load(f)
-        
-        return BertWrapper(bert_model, bpe_tokenizer,
-                           idx2tag=parameters_dict['idx2tag'],
-                           tag2idx=parameters_dict['tag2idx'],
-                           max_len=parameters_dict['max_len'],
-                           pred_loader_args=parameters_dict['pred_loader_args'],
-                           pred_batch_size=parameters_dict['pred_batch_size'])
-        print(".model loaded")
-
-
-
-def prepare_bpe_tokens_for_bert(tokens, max_len):
-    return [['[CLS]'] + list(toks[:max_len - 2]) + ['[SEP]'] for toks in tokens]
-
-
-def prepare_bpe_labels_for_bert(labels, max_len):
-    return [['[PAD]'] + list(ls[:max_len - 2]) + ['[PAD]'] for ls in labels]
-
-
-def generate_masks(input_ids):
-    res = input_ids > 0
-    return res.astype('float') if type(input_ids) is np.ndarray else res
-
-
-def create_tensors_for_tokens(bpe_tokenizer, sents, max_len):
-    return pad_sequences([bpe_tokenizer.convert_tokens_to_ids(sent) for sent in sents], 
-                         maxlen=max_len, dtype='long', 
-                         truncating='post', padding='post')
-
-
-def create_tensors_for_labels(tag2idx, labels, max_len):
-    return pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
-                         maxlen=max_len, value=tag2idx['[PAD]'], padding='post',
-                         dtype='long', truncating='post')
diff --git a/setup.py b/setup.py
index 3eb017e..49a958d 100644
--- a/setup.py
+++ b/setup.py
@@ -8,22 +8,16 @@ setup(
               'extraction.model', 'extraction.readwrite', 'extraction.attributeclassification', 'extraction.attributeclassification.subclassifiers'],
     #include_package_data=True,
     install_requires=[
-        'pandas==1.3.1',
+        'pandas',
         'pm4py==2.2.16',
-        'nltk==3.6.2',
+        'nltk',
         'numpy',
-        'scikit-learn==0.24.2',
-        'torch==1.9.0',
-        'transformers==4.9.2',
-        'tqdm==4.58.0',
-        'tensorflow==2.7.0',
-        'graphviz==0.17',
-        'spacy==3.1.1',
-        'pytorch-transformers==1.2.0',
-        'plac==1.1.3',
-        'wasabi==0.8.2',
-        'requests==2.25.1',
-        'keras',
+        'scikit-learn',
+        'transformers',
+        'spacy',
+        'wasabi',
+        'plac',
+        'torch',
         'simpletransformers',
         'matplotlib',
         'gensim'
-- 
GitLab


From 35eac4d518bbc43e26be79fc66fb21d0c9f47afa Mon Sep 17 00:00:00 2001
From: Adrian Rebmann <rebmann@informatik.uni-mannheim.de>
Date: Thu, 14 Dec 2023 10:52:32 +0100
Subject: [PATCH 2/2] update versions and model

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 49a958d..60f553b 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,6 @@ setup(
         'pandas',
         'pm4py==2.2.16',
         'nltk',
-        'numpy',
         'scikit-learn',
         'transformers',
         'spacy',
-- 
GitLab